Support split by size and dry run to write estimated shards/filesizes

This commit is contained in:
Christian Azinn 2024-04-27 21:14:41 -04:00
parent 874c3411c2
commit 72cbd4e014

View file

@ -44,8 +44,6 @@ ARCH = gguf.MODEL_ARCH.LLAMA
DEFAULT_CONCURRENCY = 8
DEFAULT_SPLIT_TENSORS = 128
ADDED_TOKENS_FILE = 'added_tokens.json'
FAST_TOKENIZER_FILE = 'tokenizer.json'
@ -54,6 +52,10 @@ LLM_KV_SPLIT_COUNT = "split.count"
LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"
SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf"
SPLIT_STYLE_NONE = 0
SPLIT_STYLE_BY_TENSORS = 1
SPLIT_STYLE_BY_SIZE = 2
#
# data types
#
@ -1215,49 +1217,46 @@ class OutputFile:
@staticmethod
def write_all(
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False,
tensors_per_shard: int, tensors_max_size: int, dry_run: bool = False, concurrency: int = DEFAULT_CONCURRENCY,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, small_first_shard: bool = True,
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
of = OutputFile(fname_out, endianess=endianess)
total_tensors = len(model)
total_size = sum(get_tensor_size(lazy_tensor) for lazy_tensor in model.values())
# meta data
of.add_meta_arch(params)
if isinstance(vocab, Vocab):
of.add_meta_vocab(vocab)
of.add_meta_special_vocab(svocab)
else: # NoVocab
of.gguf.add_tokenizer_model(vocab.tokenizer_model)
if tensors_per_shard:
split_style = SPLIT_STYLE_BY_TENSORS
elif tensors_max_size:
split_style = SPLIT_STYLE_BY_SIZE
else:
split_style = SPLIT_STYLE_NONE
# tensor info
for name, lazy_tensor in model.items():
of.add_tensor_info(name, lazy_tensor)
if tensors_per_shard and total_tensors < tensors_per_shard:
print("Model has fewer tensors than the split threshold, not splitting")
split_style = SPLIT_STYLE_NONE
of.write_meta()
of.write_tensor_info()
if tensors_max_size and total_size < tensors_max_size:
print("Model has smaller size than the split threshold, not splitting")
split_style = SPLIT_STYLE_NONE
# tensor data
of.write_tensor_data(ftype, model, concurrency)
split_strategy = create_split_strategy(split_style, fname_out, model, tensors_per_shard, tensors_max_size, small_first_shard)
total_shards = len(split_strategy)
of.close()
print("Writing the following files:")
for shard_path, shard_tensors in split_strategy:
size = format_n_bytes_to_str(sum(get_tensor_size(t[1]) for t in shard_tensors)) if shard_tensors else "negligible - metadata only"
print(f" {shard_path}: n_tensors = {len(shard_tensors) if shard_tensors else 0}, total_size = {size}")
@staticmethod
def write_split(
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
total_tensors: int, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False, tensors_per_shard: int = DEFAULT_SPLIT_TENSORS, small_first_shard: bool = True,
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
if dry_run:
print("Dry run, not writing files")
return
model_list = list(model.items())
total_shards = math.ceil(total_tensors / tensors_per_shard) + small_first_shard
shard_files = [fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, i + 1, total_shards)) for i in range(total_shards)]
for i, shard in enumerate(shard_files):
of = OutputFile(shard, endianess=endianess)
for i, (shard_path, shard_tensors) in enumerate(split_strategy):
of = OutputFile(shard_path, endianess=endianess)
if i == 0:
# meta data
of.add_meta_arch(params)
if isinstance(vocab, Vocab):
of.add_meta_vocab(vocab)
@ -1265,27 +1264,111 @@ class OutputFile:
else: # NoVocab
of.gguf.add_tokenizer_model(vocab.tokenizer_model)
of.gguf.add_uint16(LLM_KV_SPLIT_NO, i)
of.gguf.add_uint16(LLM_KV_SPLIT_COUNT, total_shards)
of.gguf.add_int32(LLM_KV_SPLIT_TENSORS_COUNT, total_tensors)
# have the option to write a first shard with only the metadata
if small_first_shard and i == 0:
of.write_meta()
of.close()
continue
if split_style != SPLIT_STYLE_NONE:
stop = min((i + 1 - small_first_shard) * tensors_per_shard, total_tensors)
shard_models = model_list[(i - small_first_shard) * tensors_per_shard:stop]
for name, lazy_tensor in shard_models:
of.gguf.add_uint16(LLM_KV_SPLIT_NO, i)
of.gguf.add_uint16(LLM_KV_SPLIT_COUNT, total_shards)
of.gguf.add_int32(LLM_KV_SPLIT_TENSORS_COUNT, total_tensors)
if small_first_shard and i == 0:
of.write_meta()
of.close()
continue
print(f"Writing shard {i + 1}/{total_shards} with {len(shard_tensors)} tensors")
# tensor info
for name, lazy_tensor in shard_tensors:
of.add_tensor_info(name, lazy_tensor)
of.write_meta()
of.write_tensor_info()
of.write_tensor_data(ftype, dict(shard_models), concurrency)
of.write_tensor_data(ftype, dict(shard_tensors), concurrency)
of.close()
def split_str_to_n_bytes(split_str: str) -> int:
    """Parse a size string like "250" (bytes), "100K", "500M", or "4G" into a byte count.

    Suffixes are case-insensitive and use binary multiples (1K = 1024 bytes).

    Raises:
        ValueError: if the string is not a number optionally followed by K, M, or G,
            or if the resulting size is not positive.
    """
    multipliers = {"K": 1024, "M": 1024 ** 2, "G": 1024 ** 3}
    suffix = split_str[-1:].upper()
    if suffix in multipliers:
        digits = split_str[:-1]
        multiplier = multipliers[suffix]
    else:
        digits = split_str
        multiplier = 1
    # str.isdecimal() (unlike str.isnumeric()) accepts exactly the strings int()
    # can parse, so inputs like "²" or a bare "K" fail here with a clear message
    # instead of an unexplained ValueError escaping from int().
    if not digits.isdecimal():
        raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")
    n = int(digits) * multiplier
    if n <= 0:
        raise ValueError(f"Invalid split size: {split_str}, must be positive")
    return n
def format_n_bytes_to_str(num: int) -> str:
    """Render a byte count as a human-readable string using binary units.

    Anything that reaches the terabyte range gets a warning suffix
    recommending the --split option.
    """
    value = float(num)
    units = ("", "K", "M", "G")
    idx = 0
    while idx < len(units) and abs(value) >= 1024.0:
        value /= 1024.0
        idx += 1
    if idx < len(units):
        return f"{value:3.1f}{units[idx]}"
    return f"{value:.1f}T - over 1TB, --split recommended"
def get_tensor_size(tensor: LazyTensor) -> int:
    """Estimated on-disk byte size of a lazy tensor: element count times per-element size."""
    n_elements = np.prod(tensor.shape)
    return tensor.data_type.elements_to_bytes(n_elements)
SplitStrategy: TypeAlias = 'list[tuple[Path, list[tuple[str, LazyTensor]]]]'


def create_split_strategy(split_style: int, fname_out: Path, model: LazyModel, tensors_per_shard: int, tensors_max_size: int, small_first_shard: bool) -> SplitStrategy:
    """Plan how a model's tensors are distributed across one or more shard files.

    Returns a list of (shard_path, tensors) pairs in write order. A
    metadata-only first shard is represented by None for its tensor list.

    Args:
        split_style: one of the SPLIT_STYLE_* constants.
        fname_out: base output path; shard names are derived from its stem.
        tensors_per_shard: tensor-count limit, used by SPLIT_STYLE_BY_TENSORS.
        tensors_max_size: byte-size limit, used by SPLIT_STYLE_BY_SIZE.
        small_first_shard: when True, prepend a shard holding only metadata.

    Raises:
        ValueError: if split_style is not a known SPLIT_STYLE_* value.
    """
    if split_style == SPLIT_STYLE_NONE:
        return [(fname_out, list(model.items()))]

    # Built once up front; the original rebuilt list(model.items()) per shard.
    tensors = list(model.items())

    if split_style == SPLIT_STYLE_BY_TENSORS:
        # bool small_first_shard counts as 0/1 toward the shard total
        total_shards = math.ceil(len(tensors) / tensors_per_shard) + small_first_shard
        shard_files = [fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, i + 1, total_shards)) for i in range(total_shards)]
        splits = []
        if small_first_shard:
            splits.append((shard_files[0], None))
        for i, shard in enumerate(shard_files[small_first_shard:]):
            start = i * tensors_per_shard
            stop = min((i + 1) * tensors_per_shard, len(tensors))
            splits.append((shard, tensors[start:stop]))
        return splits

    if split_style == SPLIT_STYLE_BY_SIZE:
        # First pass: greedily pack tensors into shards, so the total shard
        # count is known before any "...-NNNNN-of-NNNNN.gguf" name is built.
        # A running byte total replaces the original per-tensor re-sum of the
        # current shard (accidental O(n^2)).
        shards: list[list[tuple[str, LazyTensor]]] = []
        current_size = 0
        for item in tensors:
            item_size = get_tensor_size(item[1])
            # the first tensor always opens a shard, even if it alone exceeds the limit
            if not shards or current_size + item_size > tensors_max_size:
                shards.append([item])
                current_size = item_size
            else:
                shards[-1].append(item)
                current_size += item_size

        # Second pass: assign shard filenames now that the count is fixed.
        total_shards = len(shards) + small_first_shard
        splits = []
        shard_no = 1
        if small_first_shard:
            splits.append((fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, shard_no, total_shards)), None))
            shard_no += 1
        for shard in shards:
            splits.append((fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, shard_no, total_shards)), shard))
            shard_no += 1
        return splits

    # The original fell through and silently returned None here.
    raise ValueError(f"Invalid split style: {split_style}")
def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type
@ -1524,8 +1607,10 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
parser.add_argument("--split", action="store_true", help="split the converted model into multiple files")
parser.add_argument("--split-max-tensors", type=int, help=f"maximum number of tensors per file when splitting (default: {DEFAULT_SPLIT_TENSORS})", default=DEFAULT_SPLIT_TENSORS)
parser.add_argument("--large-first-shard", action="store_true", help="include tensors in the first shard when splitting (default is to only include metadata)")
parser.add_argument("--split-max-tensors", type=int, help=f"max tensors in each split")
parser.add_argument("--split-max-size", type=str, help=f"max size per split")
parser.add_argument("--dry-run", action="store_true", help="only print out a split plan and exit, without writing any new files")
parser.add_argument("--large-first-shard", action="store_true", help="include tensors in the first shard when splitting (default: metadata only)")
args = parser.parse_args(args_in)
if args.no_vocab and args.vocab_only:
@ -1536,6 +1621,15 @@ def main(args_in: list[str] | None = None) -> None:
do_dump_model(model_plus)
return
if args.split and not (args.split_max_tensors or args.split_max_size):
raise ValueError("Need to specify one of --split-max-tensors or --split-max-size when splitting")
if args.split_max_tensors and args.split_max_size:
raise ValueError("Can't specify both --split-max-tensors and --split-max-size")
if args.split_max_size:
args.split_max_size = split_str_to_n_bytes(args.split_max_size)
if not args.vocab_only:
model_plus = load_some_model(args.model)
else:
@ -1598,26 +1692,11 @@ def main(args_in: list[str] | None = None) -> None:
params.ftype = ftype
if args.split:
total_tensors = len(model)
if total_tensors < args.split_max_tensors:
print("Model has fewer tensors than the split threshold, not splitting")
print(f"Writing {outfile}, format {ftype}")
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
else:
print(f"Writing {outfile} as shards, format {ftype}")
OutputFile.write_split(outfile, ftype, params, model, vocab, special_vocab, total_tensors,
concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab,
tensors_per_shard=args.split_max_tensors, small_first_shard=not args.large_first_shard)
print(f"Wrote {outfile}")
else:
print(f"Writing {outfile}, format {ftype}")
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
print(f"Writing {outfile}, format {ftype}")
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, args.split_max_tensors,
args.split_max_size, dry_run=args.dry_run, concurrency=args.concurrency,
endianess=endianess, pad_vocab=args.pad_vocab, small_first_shard=not args.large_first_shard)
if not args.dry_run:
print(f"Wrote {outfile}")