diff --git a/convert.py b/convert.py
index ec9aa9a56..693f31a8f 100755
--- a/convert.py
+++ b/convert.py
@@ -44,8 +44,6 @@ ARCH = gguf.MODEL_ARCH.LLAMA
 
 DEFAULT_CONCURRENCY = 8
 
-DEFAULT_SPLIT_TENSORS = 128
-
 ADDED_TOKENS_FILE = 'added_tokens.json'
 FAST_TOKENIZER_FILE = 'tokenizer.json'
 
@@ -54,6 +52,10 @@ LLM_KV_SPLIT_COUNT = "split.count"
 LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"
 SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf"
 
+SPLIT_STYLE_NONE = 0
+SPLIT_STYLE_BY_TENSORS = 1
+SPLIT_STYLE_BY_SIZE = 2
+
 #
 # data types
 #
@@ -1215,49 +1217,46 @@ class OutputFile:
     @staticmethod
     def write_all(
         fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
-        concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
-        pad_vocab: bool = False,
+        tensors_per_shard: int, tensors_max_size: int, dry_run: bool = False, concurrency: int = DEFAULT_CONCURRENCY,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, small_first_shard: bool = True,
     ) -> None:
         check_vocab_size(params, vocab, pad_vocab=pad_vocab)
 
-        of = OutputFile(fname_out, endianess=endianess)
+        total_tensors = len(model)
+        total_size = sum(get_tensor_size(lazy_tensor) for lazy_tensor in model.values())
 
-        # meta data
-        of.add_meta_arch(params)
-        if isinstance(vocab, Vocab):
-            of.add_meta_vocab(vocab)
-            of.add_meta_special_vocab(svocab)
-        else:  # NoVocab
-            of.gguf.add_tokenizer_model(vocab.tokenizer_model)
+        if tensors_per_shard:
+            split_style = SPLIT_STYLE_BY_TENSORS
+        elif tensors_max_size:
+            split_style = SPLIT_STYLE_BY_SIZE
+        else:
+            split_style = SPLIT_STYLE_NONE
 
-        # tensor info
-        for name, lazy_tensor in model.items():
-            of.add_tensor_info(name, lazy_tensor)
+        if tensors_per_shard and total_tensors < tensors_per_shard:
+            print("Model has fewer tensors than the split threshold, not splitting")
+            split_style = SPLIT_STYLE_NONE
 
-        of.write_meta()
-        of.write_tensor_info()
+        if tensors_max_size and total_size < tensors_max_size:
+            print("Model is smaller than the split threshold, not splitting")
+            split_style = SPLIT_STYLE_NONE
 
-        # tensor data
-        of.write_tensor_data(ftype, model, concurrency)
+        split_strategy = create_split_strategy(split_style, fname_out, model, tensors_per_shard, tensors_max_size, small_first_shard)
+        total_shards = len(split_strategy)
 
-        of.close()
+        print("Writing the following files:")
+        for shard_path, shard_tensors in split_strategy:
+            size = format_n_bytes_to_str(sum(get_tensor_size(t[1]) for t in shard_tensors)) if shard_tensors else "negligible - metadata only"
+            print(f"  {shard_path}: n_tensors = {len(shard_tensors) if shard_tensors else 0}, total_size = {size}")
 
-    @staticmethod
-    def write_split(
-        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
-        total_tensors: int, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
-        pad_vocab: bool = False, tensors_per_shard: int = DEFAULT_SPLIT_TENSORS, small_first_shard: bool = True,
-    ) -> None:
-        check_vocab_size(params, vocab, pad_vocab=pad_vocab)
+        if dry_run:
+            print("Dry run, not writing files")
+            return
 
-        model_list = list(model.items())
-        total_shards = math.ceil(total_tensors / tensors_per_shard) + small_first_shard
-        shard_files = [fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, i + 1, total_shards)) for i in range(total_shards)]
-
-        for i, shard in enumerate(shard_files):
-            of = OutputFile(shard, endianess=endianess)
+        for i, (shard_path, shard_tensors) in enumerate(split_strategy):
+            of = OutputFile(shard_path, endianess=endianess)
 
             if i == 0:
+                # meta data
                 of.add_meta_arch(params)
                 if isinstance(vocab, Vocab):
                     of.add_meta_vocab(vocab)
@@ -1265,27 +1264,111 @@ class OutputFile:
                     of.add_meta_special_vocab(svocab)
                 else:  # NoVocab
                     of.gguf.add_tokenizer_model(vocab.tokenizer_model)
 
-            of.gguf.add_uint16(LLM_KV_SPLIT_NO, i)
-            of.gguf.add_uint16(LLM_KV_SPLIT_COUNT, total_shards)
-            of.gguf.add_int32(LLM_KV_SPLIT_TENSORS_COUNT, total_tensors)
-
-            # have the option to write a first shard with only the metadata
-            if small_first_shard and i == 0:
-                of.write_meta()
-                of.close()
-                continue
-
-            stop = min((i + 1 - small_first_shard) * tensors_per_shard, total_tensors)
-            shard_models = model_list[(i - small_first_shard) * tensors_per_shard:stop]
-            for name, lazy_tensor in shard_models:
+            if split_style != SPLIT_STYLE_NONE:
+                of.gguf.add_uint16(LLM_KV_SPLIT_NO, i)
+                of.gguf.add_uint16(LLM_KV_SPLIT_COUNT, total_shards)
+                of.gguf.add_int32(LLM_KV_SPLIT_TENSORS_COUNT, total_tensors)
+
+                # have the option to write a first shard with only the metadata
+                if small_first_shard and i == 0:
+                    of.write_meta()
+                    of.close()
+                    continue
+
+            print(f"Writing shard {i + 1}/{total_shards} with {len(shard_tensors)} tensors")
+
+            # tensor info
+            for name, lazy_tensor in shard_tensors:
                 of.add_tensor_info(name, lazy_tensor)
 
             of.write_meta()
             of.write_tensor_info()
-            of.write_tensor_data(ftype, dict(shard_models), concurrency)
+            of.write_tensor_data(ftype, dict(shard_tensors), concurrency)
+
             of.close()
 
 
+def split_str_to_n_bytes(split_str: str) -> int:
+    if split_str.endswith("K"):
+        n = int(split_str[:-1]) * 1024
+    elif split_str.endswith("M"):
+        n = int(split_str[:-1]) * 1024 * 1024
+    elif split_str.endswith("G"):
+        n = int(split_str[:-1]) * 1024 * 1024 * 1024
+    elif split_str.isnumeric():
+        n = int(split_str)
+    else:
+        raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")
+
+    if n <= 0:
+        raise ValueError(f"Invalid split size: {split_str}, must be positive")
+
+    return n
+
+
+def format_n_bytes_to_str(num: int) -> str:
+    num = float(num)
+    for unit in ("", "K", "M", "G"):
+        if abs(num) < 1024.0:
+            return f"{num:3.1f}{unit}"
+        num /= 1024.0
+    return f"{num:.1f}T - over 1TB, --split recommended"
+
+
+def get_tensor_size(tensor: LazyTensor) -> int:
+    return tensor.data_type.elements_to_bytes(np.prod(tensor.shape))
+
+
+SplitStrategy: TypeAlias = 'list[tuple[Path, list[tuple[str, LazyTensor]] | None]]'
+
+
+def create_split_strategy(split_style: int, fname_out: Path, model: LazyModel, tensors_per_shard: int, tensors_max_size: int, small_first_shard: bool) -> SplitStrategy:
+    if split_style == SPLIT_STYLE_NONE:
+        return [(fname_out, list(model.items()))]
+
+    elif split_style == SPLIT_STYLE_BY_TENSORS:
+        total_shards = math.ceil(len(model) / tensors_per_shard) + small_first_shard
+        shard_files = [fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, i + 1, total_shards)) for i in range(total_shards)]
+        splits = []
+
+        if small_first_shard:
+            splits.append((shard_files[0], None))
+
+        for i, shard in enumerate(shard_files[small_first_shard:]):
+            start = i * tensors_per_shard
+            stop = min((i + 1) * tensors_per_shard, len(model))
+            splits.append((shard, list(model.items())[start:stop]))
+
+        return splits
+
+    elif split_style == SPLIT_STYLE_BY_SIZE:
+        shards = []
+
+        # first pass: pack the tensors into shards greedily, so the total shard count is known before naming any files
+        for i, shard in enumerate(list(model.items())):
+            if i == 0:
+                shards.append([shard])
+                continue
+            if get_tensor_size(shard[1]) + sum(get_tensor_size(t[1]) for t in shards[-1]) > tensors_max_size:
+                shards.append([shard])
+            else:
+                shards[-1].append(shard)
+
+        total_shards = len(shards) + small_first_shard
+        shard_offset = 1
+        splits = []
+
+        if small_first_shard:
+            splits.append((fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, shard_offset, total_shards)), None))
+            shard_offset += 1
+
+        for i, shard in enumerate(shards):
+            splits.append((fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, i + shard_offset, total_shards)), shard))
+
+        return splits
+
+
 def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
     wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type
 
@@ -1524,8 +1607,10 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
     parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
     parser.add_argument("--split", action="store_true", help="split the converted model into multiple files")
-    parser.add_argument("--split-max-tensors", type=int, help=f"maximum number of tensors per file when splitting (default: {DEFAULT_SPLIT_TENSORS})", default=DEFAULT_SPLIT_TENSORS)
-    parser.add_argument("--large-first-shard", action="store_true", help="include tensors in the first shard when splitting (default is to only include metadata)")
+    parser.add_argument("--split-max-tensors", type=int, help="maximum number of tensors per shard when splitting")
+    parser.add_argument("--split-max-size", type=str, help="maximum size per shard when splitting, a number with optional K, M, or G suffix (bytes if no suffix)")
+    parser.add_argument("--dry-run", action="store_true", help="only print out a split plan and exit, without writing any new files")
+    parser.add_argument("--large-first-shard", action="store_true", help="include tensors in the first shard when splitting (default: metadata only)")
 
     args = parser.parse_args(args_in)
     if args.no_vocab and args.vocab_only:
@@ -1536,6 +1621,15 @@ def main(args_in: list[str] | None = None) -> None:
         do_dump_model(model_plus)
         return
 
+    if args.split and not (args.split_max_tensors or args.split_max_size):
+        raise ValueError("Need to specify one of --split-max-tensors or --split-max-size when splitting")
+
+    if args.split_max_tensors and args.split_max_size:
+        raise ValueError("Can't specify both --split-max-tensors and --split-max-size")
+
+    if args.split_max_size:
+        args.split_max_size = split_str_to_n_bytes(args.split_max_size)
+
     if not args.vocab_only:
         model_plus = load_some_model(args.model)
     else:
@@ -1598,26 +1692,11 @@ def main(args_in: list[str] | None = None) -> None:
 
     params.ftype = ftype
 
-    if args.split:
-        total_tensors = len(model)
-        if total_tensors < args.split_max_tensors:
-
-            print("Model has fewer tensors than the split threshold, not splitting")
-            print(f"Writing {outfile}, format {ftype}")
-            OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
-                                 concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
-        else:
-            print(f"Writing {outfile} as shards, format {ftype}")
-            OutputFile.write_split(outfile, ftype, params, model, vocab, special_vocab, total_tensors,
-                                   concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab,
-                                   tensors_per_shard=args.split_max_tensors, small_first_shard=not args.large_first_shard)
-        print(f"Wrote {outfile}")
-
-    else:
-        print(f"Writing {outfile}, format {ftype}")
-
-        OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
-                             concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
+    print(f"Writing {outfile}, format {ftype}")
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, args.split_max_tensors,
+                         args.split_max_size, dry_run=args.dry_run, concurrency=args.concurrency,
+                         endianess=endianess, pad_vocab=args.pad_vocab, small_first_shard=not args.large_first_shard)
+
+    if not args.dry_run:
+        print(f"Wrote {outfile}")
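
To illustrate the SPLIT_STYLE_BY_SIZE branch, here is a minimal, self-contained sketch of the greedy packing that create_split_strategy performs; plan_shards_by_size and the toy sizes are illustrative only (they stand in for get_tensor_size() results) and are not part of the patch:

# standalone sketch of the greedy by-size packing, assuming integer tensor
# sizes in bytes; plan_shards_by_size is a hypothetical helper for illustration
def plan_shards_by_size(sizes: list[int], max_size: int) -> list[list[int]]:
    shards = [[sizes[0]]]                    # the first tensor always opens shard 1
    for size in sizes[1:]:
        if size + sum(shards[-1]) > max_size:
            shards.append([size])            # would overflow the cap: open a new shard
        else:
            shards[-1].append(size)          # still fits: keep filling the current shard
    return shards

G = 1024 ** 3
print(plan_shards_by_size([300 * 1024**2, 600 * 1024**2, 2 * G, 100, 200], G))
# -> [[314572800, 629145600], [2147483648], [100, 200]]

Because the packing is greedy and never splits a tensor, a shard exceeds --split-max-size only when a single tensor is itself larger than the limit; such a tensor gets a shard of its own rather than aborting the conversion.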