diff --git a/convert.py b/convert.py
index 693f31a8f..31c96e5ad 100755
--- a/convert.py
+++ b/convert.py
@@ -24,13 +24,15 @@ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable
+import importlib
+gguf = importlib.import_module("gguf-py.gguf")
 
 import numpy as np
 from sentencepiece import SentencePieceProcessor
 
 if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
-import gguf
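+# NOTE(review): gguf is loaded through importlib above, so NO_LOCAL_GGUF no longer selects which gguf is used; consider restoring "import gguf" here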
 
 if TYPE_CHECKING:
     from typing_extensions import Self, TypeAlias
@@ -47,15 +49,6 @@ DEFAULT_CONCURRENCY = 8
 ADDED_TOKENS_FILE = 'added_tokens.json'
 FAST_TOKENIZER_FILE = 'tokenizer.json'
 
-LLM_KV_SPLIT_NO = "split.no"
-LLM_KV_SPLIT_COUNT = "split.count"
-LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"
-SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf"
-
-SPLIT_STYLE_NONE = 0
-SPLIT_STYLE_BY_TENSORS = 1
-SPLIT_STYLE_BY_SIZE = 2
-
 #
 # data types
 #
@@ -1066,8 +1059,8 @@ def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False)
 
 
 class OutputFile:
-    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
-        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
+    def __init__(self, fname_out: Path, args: argparse.Namespace, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
+        self.gguf = gguf.GGUFManager(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], args, endianess=endianess)
 
     def add_meta_arch(self, params: Params) -> None:
         name = "LLaMA"
@@ -1146,21 +1139,15 @@ class OutputFile:
     def add_meta_special_vocab(self, svocab: gguf.SpecialVocab) -> None:
         svocab.add_to_gguf(self.gguf)
 
-    def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
-        n_elements = int(np.prod(tensor.shape))
-        raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
-        data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype
-        data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
-        self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype)
-
     def write_meta(self) -> None:
-        self.gguf.write_header_to_file()
-        self.gguf.write_kv_data_to_file()
+        self.gguf.write_to_file(meta_only=True)
 
-    def write_tensor_info(self) -> None:
-        self.gguf.write_ti_data_to_file()
+    def write_tensors(self, ftype: GGMLFileType, concurrency: int) -> None:
+        self.gguf.write_to_file(ftype=ftype, concurrency=concurrency, write_tensor_data=OutputFile.write_tensor_data)
 
-    def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None:
+    # really awkward with how this is managed with gguf_manager.py: maybe refactor at some point?
+    @staticmethod
+    def write_tensor_data(ftype: GGMLFileType, model: LazyModel, concurrency: int, writer: gguf.GGUFWriter) -> None:
         ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency)
         if ftype == GGMLFileType.MostlyQ8_0:
             ndarrays = bounded_parallel_map(
@@ -1178,7 +1165,7 @@ class OutputFile:
             print(
                 f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
             )
-            self.gguf.write_tensor_data(ndarray)
+            writer.write_tensor_data(ndarray)
 
     def close(self) -> None:
         self.gguf.close()
@@ -1217,156 +1204,26 @@ class OutputFile:
     @staticmethod
     def write_all(
         fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
-        tensors_per_shard: int, tensors_max_size: int, dry_run: bool = False, concurrency: int = DEFAULT_CONCURRENCY,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, small_first_shard: bool = True,
+        args: argparse.Namespace, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE
     ) -> None:
-        check_vocab_size(params, vocab, pad_vocab=pad_vocab)
+        check_vocab_size(params, vocab, pad_vocab=args.pad_vocab)
+        of = OutputFile(fname_out, args, endianess=endianess)
 
-        total_tensors = len(model)
-        total_size = sum(get_tensor_size(lazy_tensor) for lazy_tensor in model.values())
+        # meta data
+        of.add_meta_arch(params)
+        if isinstance(vocab, Vocab):
+            of.add_meta_vocab(vocab)
+            of.add_meta_special_vocab(svocab)
+        else:  # NoVocab
+            of.gguf.add_tokenizer_model(vocab.tokenizer_model)
 
-        if tensors_per_shard:
-            split_style = SPLIT_STYLE_BY_TENSORS
-        elif tensors_max_size:
-            split_style = SPLIT_STYLE_BY_SIZE
-        else:
-            split_style = SPLIT_STYLE_NONE
+        # tensor info
+        for name, lazy_tensor in model.items():
+            of.gguf.add_tensor_info(name, lazy_tensor)
 
-        if tensors_per_shard and total_tensors < tensors_per_shard:
-            print("Model has fewer tensors than the split threshold, not splitting")
-            split_style = SPLIT_STYLE_NONE
+        of.write_tensors(ftype, concurrency)
 
-        if tensors_max_size and total_size < tensors_max_size:
-            print("Model has smaller size than the split threshold, not splitting")
-            split_style = SPLIT_STYLE_NONE
-
-        split_strategy = create_split_strategy(split_style, fname_out, model, tensors_per_shard, tensors_max_size, small_first_shard)
-        total_shards = len(split_strategy)
-
-        print("Writing the following files:")
-        for shard_path, shard_tensors in split_strategy:
-            size = format_n_bytes_to_str(sum(get_tensor_size(t[1]) for t in shard_tensors)) if shard_tensors else "negligible - metadata only"
-            print(f"  {shard_path}: n_tensors = {len(shard_tensors) if shard_tensors else 0}, total_size = {size}")
-
-        if dry_run:
-            print("Dry run, not writing files")
-            return
-
-        for i, (shard_path, shard_tensors) in enumerate(split_strategy):
-            of = OutputFile(shard_path, endianess=endianess)
-
-            if i == 0:
-                # meta data
-                of.add_meta_arch(params)
-                if isinstance(vocab, Vocab):
-                    of.add_meta_vocab(vocab)
-                    of.add_meta_special_vocab(svocab)
-                else:  # NoVocab
-                    of.gguf.add_tokenizer_model(vocab.tokenizer_model)
-
-            # have the option to write a first shard with only the metadata
-            if split_style != SPLIT_STYLE_NONE:
-
-                of.gguf.add_uint16(LLM_KV_SPLIT_NO, i)
-                of.gguf.add_uint16(LLM_KV_SPLIT_COUNT, total_shards)
-                of.gguf.add_int32(LLM_KV_SPLIT_TENSORS_COUNT, total_tensors)
-
-                if small_first_shard and i == 0:
-                    of.write_meta()
-                    of.close()
-                    continue
-
-                print(f"Writing shard {i + 1}/{total_shards} with {len(shard_tensors)} tensors")
-
-            # tensor info
-            for name, lazy_tensor in shard_tensors:
-                of.add_tensor_info(name, lazy_tensor)
-
-            of.write_meta()
-            of.write_tensor_info()
-            of.write_tensor_data(ftype, dict(shard_tensors), concurrency)
-
-            of.close()
-
-
-def split_str_to_n_bytes(split_str: str) -> int:
-    if split_str.endswith("K"):
-        n = int(split_str[:-1]) * 1024
-    elif split_str.endswith("M"):
-        n = int(split_str[:-1]) * 1024 * 1024
-    elif split_str.endswith("G"):
-        n = int(split_str[:-1]) * 1024 * 1024 * 1024
-    elif split_str.isnumeric():
-        n = int(split_str)
-    else:
-        raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")
-
-    if n <= 0:
-        raise ValueError(f"Invalid split size: {split_str}, must be positive")
-
-    return n
-
-
-def format_n_bytes_to_str(num: int) -> str:
-    num = float(num)
-    for unit in ("", "K", "M", "G"):
-        if abs(num) < 1024.0:
-            return f"{num:3.1f}{unit}"
-        num /= 1024.0
-    return f"{num:.1f}T - over 1TB, --split recommended"
-
-
-def get_tensor_size(tensor: LazyTensor) -> int:
-    return tensor.data_type.elements_to_bytes(np.prod(tensor.shape))
-
-
-SplitStrategy: TypeAlias = 'list[tuple[Path, list[tuple[str, LazyTensor]]]]'
-
-
-def create_split_strategy(split_style: int, fname_out: Path, model: LazyModel, tensors_per_shard: int, tensors_max_size: int, small_first_shard: bool) -> SplitStrategy:
-    if split_style == SPLIT_STYLE_NONE:
-        return [(fname_out, list(model.items()))]
-
-    elif split_style == SPLIT_STYLE_BY_TENSORS:
-        total_shards = math.ceil(len(model) / tensors_per_shard) + small_first_shard
-        shard_files = [fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, i + 1, total_shards)) for i in range(total_shards)]
-        splits = []
-
-        if small_first_shard:
-            splits.append((shard_files[0], None))
-
-        for i, shard in enumerate(shard_files[small_first_shard:]):
-            start = i * tensors_per_shard
-            stop = min((i + 1) * tensors_per_shard, len(model))
-            splits.append((shard, list(model.items())[start:stop]))
-
-        return splits
-
-    elif split_style == SPLIT_STYLE_BY_SIZE:
-        shards = []
-
-        # we have to determine the shards first to determine how many shards there will be in total - two passes
-        for i, shard in enumerate(list(model.items())):
-            if i == 0:
-                shards.append([shard])
-                continue
-            if get_tensor_size(shard[1]) + sum(get_tensor_size(t[1]) for t in shards[-1]) > tensors_max_size:
-                shards.append([shard])
-            else:
-                shards[-1].append(shard)
-
-        total_shards = len(shards) + small_first_shard
-        shard_offset = 1
-        splits = []
-
-        if small_first_shard:
-            splits.append((fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, shard_offset, total_shards)), None))
-            shard_offset += 1
-
-        for i, shard in enumerate(shards):
-            splits.append((fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, i + shard_offset, total_shards)), shard))
-
-        return splits
+        of.close()
 
 
 def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
@@ -1607,8 +1464,8 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--pad-vocab",    action="store_true",    help="add pad tokens when model vocab expects more than tokenizer metadata provides")
     parser.add_argument("--skip-unknown", action="store_true",    help="skip unknown tensor names instead of failing")
     parser.add_argument("--split", action="store_true", help="split the converted model into multiple files")
-    parser.add_argument("--split-max-tensors", type=int, help=f"max tensors in each split")
-    parser.add_argument("--split-max-size", type=str, help=f"max size per split")
+    parser.add_argument("--split-max-tensors", type=int, help="max tensors in each split")
+    parser.add_argument("--split-max-size", type=str, help="max size per split N(M|G)+")
     parser.add_argument("--dry-run", action="store_true", help="only print out a split plan and exit, without writing any new files")
     parser.add_argument("--large-first-shard", action="store_true", help="include tensors in the first shard when splitting (default: metadata only)")
 
@@ -1628,7 +1485,7 @@ def main(args_in: list[str] | None = None) -> None:
         raise ValueError("Can't specify both --split-max-tensors and --split-max-size")
 
     if args.split_max_size:
-        args.split_max_size = split_str_to_n_bytes(args.split_max_size)
+        args.split_max_size = gguf.SplitStrategy.split_str_to_n_bytes(args.split_max_size)
 
     if not args.vocab_only:
         model_plus = load_some_model(args.model)
@@ -1693,9 +1550,8 @@ def main(args_in: list[str] | None = None) -> None:
     params.ftype = ftype
 
     print(f"Writing {outfile}, format {ftype}")
-    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, args.split_max_tensors,
-                         args.split_max_size, dry_run=args.dry_run, concurrency=args.concurrency,
-                         endianess=endianess, pad_vocab=args.pad_vocab, small_first_shard=not args.large_first_shard)
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, args,
+                         concurrency=args.concurrency, endianess=endianess)
     if not args.dry_run:
         print(f"Wrote {outfile}")
 
diff --git a/gguf-py/gguf/__init__.py b/gguf-py/gguf/__init__.py
index 110ab342c..bd904fa2a 100644
--- a/gguf-py/gguf/__init__.py
+++ b/gguf-py/gguf/__init__.py
@@ -1,5 +1,6 @@
 from .constants import *
 from .gguf_reader import *
 from .gguf_writer import *
+from .gguf_manager import *
 from .tensor_mapping import *
 from .vocab import *
diff --git a/gguf-py/gguf/gguf_manager.py b/gguf-py/gguf/gguf_manager.py
new file mode 100644
index 000000000..b1e680810
--- /dev/null
+++ b/gguf-py/gguf/gguf_manager.py
@@ -0,0 +1,523 @@
+from __future__ import annotations
+
+import os
+import shutil
+import struct
+import tempfile
+from enum import IntEnum
+from typing import TYPE_CHECKING, Any, Callable, Sequence, Mapping
+from string import ascii_letters, digits
+from argparse import Namespace
+from math import ceil
+
+import numpy as np
+
+if TYPE_CHECKING:
+    from typing_extensions import TypeAlias
+
+from .constants import (
+    GGMLQuantizationType,
+    GGUFEndian,
+    GGUFValueType,
+    Keys,
+    RopeScalingType,
+    PoolingType,
+    TokenType,
+)
+from .gguf_writer import GGUFWriter
+
+
+SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf"
+
+LLM_KV_SPLIT_NO = "split.no"
+LLM_KV_SPLIT_COUNT = "split.count"
+LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"
+
+SplitTensorsPerFile: TypeAlias = 'list[tuple[os.PathLike[str], list[tuple[str, Any]], GGUFWriter]]' # quoted so the subscripts are not evaluated at import time; [(outfile name, [(tensor name, tensor data), ...], filewriter)]
+KVTempData: TypeAlias = 'dict[str, tuple[Any, GGUFValueType]]' # {key: (value, type)}
+TensorTempData: TypeAlias = 'tuple[str, np.ndarray[Any, Any]]' # (tensor name, tensor data); convert.py passes its LazyTensor objects as the data
+
+
+class SplitStyle(IntEnum):
+    NONE = 0  # single output file, no sharding
+    TENSORS = 1  # shard by tensor count (args.split_max_tensors)
+    SIZE = 2  # shard by byte size (args.split_max_size)
+
+
+class SplitStrategy:
+    data: SplitTensorsPerFile
+
+    def __init__(self, split_style: SplitStyle, fname_out: os.PathLike[str], model: list[TensorTempData],
+                 args: Namespace, arch: str, use_temp_file: bool = True, endianess: GGUFEndian = GGUFEndian.LITTLE, small_first_shard: bool = True
+    ):
+        """Fill self.data with one (path, tensors, GGUFWriter) entry per output file; tensors is None for a metadata-only first shard."""
+        self.data = []
+        if split_style == SplitStyle.NONE:
+            self.append((fname_out, model, GGUFWriter(fname_out, arch, use_temp_file=use_temp_file, endianess=endianess)))
+
+        elif split_style == SplitStyle.TENSORS:
+            total_shards = ceil(len(model) / args.split_max_tensors) + small_first_shard  # the bool adds 0 or 1 for the metadata-only shard
+            shard_files = [fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, i + 1, total_shards)) for i in range(total_shards)]
+
+            if small_first_shard:
+                self.append((shard_files[0], None, GGUFWriter(shard_files[0], arch, use_temp_file=use_temp_file, endianess=endianess)))
+
+            for i, shard in enumerate(shard_files[small_first_shard:]):
+                start = i * args.split_max_tensors
+                stop = min((i + 1) * args.split_max_tensors, len(model))
+                self.append((shard, model[start:stop], GGUFWriter(shard, arch, use_temp_file=use_temp_file, endianess=endianess)))
+
+        elif split_style == SplitStyle.SIZE:
+            shards = []
+            shard_size = 0  # bytes already assigned to shards[-1]; kept as a running total instead of re-summing the shard per tensor
+            # first pass: group the tensors greedily, because the shard count must be known before any file can be named
+            for item in model:
+                item_size = SplitStrategy.get_tensor_size(item[1])
+                if not shards or shard_size + item_size > args.split_max_size:
+                    shards.append([item])
+                    shard_size = item_size
+                else:
+                    shards[-1].append(item)
+                    shard_size += item_size
+
+            total_shards = len(shards) + small_first_shard
+            shard_offset = 1
+
+            if small_first_shard:
+                outname = fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, shard_offset, total_shards))
+                self.append((outname, None, GGUFWriter(outname, arch, use_temp_file=use_temp_file, endianess=endianess)))
+                shard_offset += 1
+
+            for i, shard in enumerate(shards):
+                outname = fname_out.with_name(SHARD_NAME_FORMAT.format(fname_out.stem, i + shard_offset, total_shards))
+                self.append((outname, shard, GGUFWriter(outname, arch, use_temp_file=use_temp_file, endianess=endianess)))
+
+    def __getitem__(self, index):
+        return self.data[index]
+    
+    def __setitem__(self, index, value):
+        self.data[index] = value
+
+    def __len__(self):
+        return len(self.data)
+
+    def append(self, value: tuple[os.PathLike[str], list[TensorTempData] | None, GGUFWriter]):  # one (path, tensors, writer) plan entry
+        self.data.append(value)
+
+    def remove(self, item: tuple[os.PathLike[str], list[TensorTempData] | None, GGUFWriter]):  # drops the first equal (path, tensors, writer) entry
+        self.data.remove(item)
+
+    @staticmethod
+    def get_tensor_size(tensor) -> int:
+        """Return the tensor's size in bytes; accepts convert.py LazyTensor-like objects or anything exposing nbytes."""
+        try:
+            return tensor.data_type.elements_to_bytes(np.prod(tensor.shape))
+        except AttributeError:  # no data_type/elements_to_bytes/shape: fall back to numpy's nbytes
+            return tensor.nbytes
+        except Exception as err:  # should never happen; Exception rather than a bare except so KeyboardInterrupt/SystemExit pass through
+            raise ValueError(f"Invalid tensor type: {type(tensor)}") from err
+    
+    @staticmethod
+    def split_str_to_n_bytes(split_str: str) -> int:
+        """Parse "<digits>" or "<int>K|M|G" (binary multiples) into a positive byte count; raises ValueError otherwise."""
+        units = {"K": 1024, "M": 1024 * 1024, "G": 1024 * 1024 * 1024}
+        suffix = split_str[-1:]
+        number = split_str[:-1] if suffix in units else split_str
+        try:
+            # a bare value must be all digits (as before); a suffixed value is handed to int()
+            n = int(number) * units.get(suffix, 1) if suffix in units or number.isnumeric() else None
+        except ValueError:
+            n = None  # e.g. "K" or "1.5G": report it with the same message as any other malformed size
+        if n is None:
+            raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")
+        if n <= 0:
+            raise ValueError(f"Invalid split size: {split_str}, must be positive")
+
+        return n
+
+    @staticmethod
+    def format_n_bytes_to_str(num: int) -> str:  # render a byte count with one decimal and a binary K/M/G suffix
+        num = float(num)
+        for unit in ("", "K", "M", "G"):
+            if abs(num) < 1024.0:
+                return f"{num:3.1f}{unit}"
+            num /= 1024.0  # move on to the next larger unit
+        return f"{num:.1f}T - over 1TB, --split recommended"  # reached only when the value is 1024G or more
+
+
+# ideally this has most of the same signatures as GGUFWriter so it's nearly a drop-in replacement
+class GGUFManager:
+    kv_data: KVTempData
+    tensors: list[TensorTempData]
+    split_style: SplitStyle
+    split_strategy: SplitStrategy
+
+    def __init__(self, path: os.PathLike[str] | str, arch: str, args: Namespace, use_temp_file: bool = True,
+                 endianess: GGUFEndian = GGUFEndian.LITTLE) -> None:
+        self.arch = arch
+        self.path = path
+        self.endianess = endianess
+        self.offset_tensor = 0
+        self.kv_data = {}  # key -> (value, GGUFValueType); buffered here until write_to_file()
+        self.tensors = []  # (name, tensor) pairs collected by add_tensor_info()
+        self.args = args  # this class reads split, split_max_tensors, split_max_size, large_first_shard and dry_run from it
+        self.split_style = SplitStyle.NONE if not args.split \
+            else SplitStyle.TENSORS if args.split_max_tensors \
+            else SplitStyle.SIZE  # NOTE(review): also chosen when --split is set with neither threshold; split_max_size would then be None - confirm callers validate this
+        self.split_strategy = None  # built in write_to_file()
+        self.total_shards = None  # set in write_to_file()
+        self.total_tensors = None  # set in write_to_file()
+        self.use_temp_file = use_temp_file
+
+        self.add_architecture()  # buffers the architecture string in kv_data via add_string()
+
+    # have to consolidate because we need to know kv data count and tensor count before we can write the header
+    # and we need to write tensor info before we can write metadata
+    # these all kinda show up around the same places anyway so it's not a huge deal?
+    def write_to_file(self, meta_only: bool = False, ftype: int = 0, concurrency: int = 8, write_tensor_data: Callable[..., None] | None = None) -> None:
+        """Build the split plan, then write header + KV data per shard (meta_only) or additionally tensor info and data (skipped when args.dry_run)."""
+        # here is the first place you can assume you have all tensors written and you can establish the size of the file - so logic goes here
+        self.total_tensors = len(self.tensors)
+        total_size = sum(SplitStrategy.get_tensor_size(tensor[1]) for tensor in self.tensors)
+
+        if self.args.split_max_tensors and self.total_tensors < self.args.split_max_tensors:
+            print("Model has fewer tensors than the split threshold, not splitting")
+            self.split_style = SplitStyle.NONE
+
+        if self.args.split_max_size and total_size < self.args.split_max_size:
+            print("Model has smaller size than the split threshold, not splitting")
+            self.split_style = SplitStyle.NONE
+
+        self.split_strategy = SplitStrategy(self.split_style, self.path, self.tensors, self.args, self.arch, use_temp_file=self.use_temp_file, endianess=self.endianess, small_first_shard=not self.args.large_first_shard)
+        self.total_shards = len(self.split_strategy)
+
+        # only the first shard needs all the KV data
+        for key, (value, etype) in self.kv_data.items():
+            self.split_strategy[0][2].add_key(key)
+            self.split_strategy[0][2].add_val(value, etype)
+
+        if self.split_style != SplitStyle.NONE:
+            for i, (_, _, writer) in enumerate(self.split_strategy):
+                writer.add_uint16(LLM_KV_SPLIT_NO, i)
+                writer.add_uint16(LLM_KV_SPLIT_COUNT, self.total_shards)
+                writer.add_int32(LLM_KV_SPLIT_TENSORS_COUNT, self.total_tensors)
+
+        # metadata/vocab only can write and return here
+        if meta_only:
+            for _, _, writer in self.split_strategy:
+                writer.write_header_to_file()
+                writer.write_kv_data_to_file()
+            return
+
+        # tensor writing code starts here
+
+        print("\nWriting the following files:")
+        for (shard_path, shard_tensors, _) in self.split_strategy:
+            size = SplitStrategy.format_n_bytes_to_str(sum(SplitStrategy.get_tensor_size(t[1]) for t in shard_tensors)) if shard_tensors else "negligible - metadata only"
+            print(f"  {shard_path}: n_tensors = {len(shard_tensors) if shard_tensors else 0}, total_size = {size}")
+
+        if self.args.dry_run:
+            print("\nDry run, not writing files")
+            return
+        if write_tensor_data is None and self.tensors:  # fail before any header is written rather than midway through the first shard
+            raise ValueError("write_tensor_data callback is required to write tensors")
+        running_total = self.total_tensors
+        for i, (_, tensors, writer) in enumerate(self.split_strategy):
+            # per shard: register tensor info, write header/KV/tensor info, then stream the tensor data - taken from convert.py
+            if tensors:
+                for name, tensor in tensors:
+                    n_elements = int(np.prod(tensor.shape))
+                    raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
+                    data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype
+                    data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
+                    writer.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype)
+
+            writer.write_header_to_file()
+            writer.write_kv_data_to_file()
+            writer.write_tensors_to_file()
+
+            if tensors:
+                print(f"\nWriting to shard {i + 1}/{self.total_shards} with {len(tensors)}/{running_total} remaining tensors (of {self.total_tensors} total)")
+                running_total -= len(tensors)
+
+                # convert.py's write_tensor_data is dependent on so many objects in convert.py itself that it's easier to pass the function as a parameter and call it here
+                write_tensor_data(ftype, dict(tensors), concurrency, writer)
+
+    def add_uint8(self, key: str, val: int) -> None:
+        self.kv_data[key] = (val, GGUFValueType.UINT8)
+
+    def add_int8(self, key: str, val: int) -> None:
+        self.kv_data[key] = (val, GGUFValueType.INT8)
+
+    def add_uint16(self, key: str, val: int) -> None:
+        self.kv_data[key] = (val, GGUFValueType.UINT16)
+
+    def add_int16(self, key: str, val: int) -> None:
+        self.kv_data[key] = (val, GGUFValueType.INT16)
+
+    def add_uint32(self, key: str, val: int) -> None:
+        self.kv_data[key] = (val, GGUFValueType.UINT32)
+
+    def add_int32(self, key: str, val: int) -> None:
+        self.kv_data[key] = (val, GGUFValueType.INT32)
+
+    def add_float32(self, key: str, val: float) -> None:
+        self.kv_data[key] = (val, GGUFValueType.FLOAT32)
+
+    def add_uint64(self, key: str, val: int) -> None:
+        self.kv_data[key] = (val, GGUFValueType.UINT64)
+
+    def add_int64(self, key: str, val: int) -> None:
+        self.kv_data[key] = (val, GGUFValueType.INT64)
+
+    def add_float64(self, key: str, val: float) -> None:
+        self.kv_data[key] = (val, GGUFValueType.FLOAT64)
+
+    def add_bool(self, key: str, val: bool) -> None:
+        self.kv_data[key] = (val, GGUFValueType.BOOL)
+
+    def add_string(self, key: str, val: str) -> None:
+        if not val:
+            return
+        self.kv_data[key] = (val, GGUFValueType.STRING)
+
+    def add_array(self, key: str, val: Sequence[Any]) -> None:
+        if not isinstance(val, Sequence):
+            raise ValueError(f'Expected a sequence for {key}, got {type(val)}')
+        self.kv_data[key] = (val, GGUFValueType.ARRAY)
+
+    # this method is exclusive to convert.py - we don't have LazyTensor so Any type is used
+    def add_tensor_info(self, name: str, tensor: Any) -> None:
+        self.tensors.append((name, tensor))
+
+    # these methods are everywhere but convert.py (and convert-lora-to-ggml.py since that doesn't use the class)
+    def add_tensor(
+        self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None,
+        raw_dtype: GGMLQuantizationType | None = None,
+    ) -> None:
+        # TODO WRITE - fail loudly until implemented so tensors are never dropped silently
+        raise NotImplementedError("GGUFManager.add_tensor is not implemented yet; use add_tensor_info")
+
+    def write_tensors_to_file(self) -> None:
+        # TODO WRITE - fail loudly until implemented instead of silently writing nothing
+        raise NotImplementedError("GGUFManager.write_tensors_to_file is not implemented yet; use write_to_file")
+
+    def close(self) -> None:
+        for _, _, writer in self.split_strategy or ():  # split_strategy is still None when write_to_file() never ran: nothing to close
+            writer.close()
+
+    def add_architecture(self) -> None:
+        self.add_string(Keys.General.ARCHITECTURE, self.arch)
+
+    def add_author(self, author: str) -> None:
+        self.add_string(Keys.General.AUTHOR, author)
+
+    def add_version(self, version: str) -> None:
+        self.add_string(Keys.General.VERSION, version)
+
+    def add_tensor_data_layout(self, layout: str) -> None:
+        self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
+
+    def add_url(self, url: str) -> None:
+        self.add_string(Keys.General.URL, url)
+
+    def add_description(self, description: str) -> None:
+        self.add_string(Keys.General.DESCRIPTION, description)
+
+    def add_licence(self, licence: str) -> None:
+        self.add_string(Keys.General.LICENSE, licence)
+
+    def add_source_url(self, url: str) -> None:
+        self.add_string(Keys.General.SOURCE_URL, url)
+
+    def add_source_hf_repo(self, repo: str) -> None:
+        self.add_string(Keys.General.SOURCE_HF_REPO, repo)
+
+    def add_file_type(self, ftype: int) -> None:
+        self.add_uint32(Keys.General.FILE_TYPE, ftype)
+
+    def add_name(self, name: str) -> None:
+        self.add_string(Keys.General.NAME, name)
+
+    def add_quantization_version(self, quantization_version: GGMLQuantizationType) -> None:
+        self.add_uint32(Keys.General.QUANTIZATION_VERSION, quantization_version)
+
+    def add_custom_alignment(self, alignment: int) -> None:
+        self.data_alignment = alignment  # NOTE(review): never read in this module nor passed to the per-shard GGUFWriter objects - confirm the writers pad with this alignment
+        self.add_uint32(Keys.General.ALIGNMENT, alignment)
+
+    def add_vocab_size(self, size: int) -> None:
+        self.add_uint32(Keys.LLM.VOCAB_SIZE.format(arch=self.arch), size)
+
+    def add_context_length(self, length: int) -> None:
+        self.add_uint32(Keys.LLM.CONTEXT_LENGTH.format(arch=self.arch), length)
+
+    def add_embedding_length(self, length: int) -> None:
+        self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length)
+
+    def add_block_count(self, length: int) -> None:
+        self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length)
+
+    def add_feed_forward_length(self, length: int) -> None:
+        self.add_uint32(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length)
+
+    def add_parallel_residual(self, use: bool) -> None:
+        self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
+
+    def add_head_count(self, count: int) -> None:
+        self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)
+
+    def add_head_count_kv(self, count: int) -> None:
+        self.add_uint32(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count)
+
+    def add_key_length(self, length: int) -> None:
+        self.add_uint32(Keys.Attention.KEY_LENGTH.format(arch=self.arch), length)
+
+    def add_value_length(self, length: int) -> None:
+        self.add_uint32(Keys.Attention.VALUE_LENGTH.format(arch=self.arch), length)
+
+    def add_max_alibi_bias(self, bias: float) -> None:
+        self.add_float32(Keys.Attention.MAX_ALIBI_BIAS.format(arch=self.arch), bias)
+
+    def add_clamp_kqv(self, value: float) -> None:
+        self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value)
+
+    def add_logit_scale(self, value: float) -> None:
+        self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value)
+
+    def add_expert_count(self, count: int) -> None:
+        self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count)
+
+    def add_expert_used_count(self, count: int) -> None:
+        self.add_uint32(Keys.LLM.EXPERT_USED_COUNT.format(arch=self.arch), count)
+
+    def add_layer_norm_eps(self, value: float) -> None:
+        self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)
+
+    def add_layer_norm_rms_eps(self, value: float) -> None:
+        self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value)
+
+    def add_causal_attention(self, value: bool) -> None:
+        self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)
+
+    def add_pooling_type(self, value: PoolingType) -> None:
+        self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
+
+    def add_rope_dimension_count(self, count: int) -> None:
+        self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
+
+    def add_rope_freq_base(self, value: float) -> None:
+        self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value)
+
+    def add_rope_scaling_type(self, value: RopeScalingType) -> None:
+        self.add_string(Keys.Rope.SCALING_TYPE.format(arch=self.arch), value.value)
+
+    def add_rope_scaling_factor(self, value: float) -> None:
+        self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value)
+
+    def add_rope_scaling_orig_ctx_len(self, value: int) -> None:
+        self.add_uint32(Keys.Rope.SCALING_ORIG_CTX_LEN.format(arch=self.arch), value)
+
+    def add_rope_scaling_finetuned(self, value: bool) -> None:
+        self.add_bool(Keys.Rope.SCALING_FINETUNED.format(arch=self.arch), value)
+
+    def add_ssm_conv_kernel(self, value: int) -> None:
+        self.add_uint32(Keys.SSM.CONV_KERNEL.format(arch=self.arch), value)
+
+    def add_ssm_inner_size(self, value: int) -> None:
+        self.add_uint32(Keys.SSM.INNER_SIZE.format(arch=self.arch), value)
+
+    def add_ssm_state_size(self, value: int) -> None:
+        self.add_uint32(Keys.SSM.STATE_SIZE.format(arch=self.arch), value)
+
+    def add_ssm_time_step_rank(self, value: int) -> None:
+        self.add_uint32(Keys.SSM.TIME_STEP_RANK.format(arch=self.arch), value)
+
+    def add_tokenizer_model(self, model: str) -> None:
+        self.add_string(Keys.Tokenizer.MODEL, model)
+
+    def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None:
+        self.add_array(Keys.Tokenizer.LIST, tokens)
+
+    def add_token_merges(self, merges: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None:
+        self.add_array(Keys.Tokenizer.MERGES, merges)
+
+    def add_token_types(self, types: Sequence[TokenType] | Sequence[int]) -> None:
+        self.add_array(Keys.Tokenizer.TOKEN_TYPE, types)
+
+    def add_token_type_count(self, value: int) -> None:
+        self.add_uint32(Keys.Tokenizer.TOKEN_TYPE_COUNT, value)
+
+    def add_token_scores(self, scores: Sequence[float]) -> None:
+        self.add_array(Keys.Tokenizer.SCORES, scores)
+
+    def add_bos_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.BOS_ID, id)
+
+    def add_eos_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.EOS_ID, id)
+
+    def add_unk_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.UNK_ID, id)
+
+    def add_sep_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.SEP_ID, id)
+
+    def add_pad_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.PAD_ID, id)
+
+    def add_cls_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.CLS_ID, id)
+
+    def add_mask_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.MASK_ID, id)
+
+    def add_add_bos_token(self, value: bool) -> None:
+        self.add_bool(Keys.Tokenizer.ADD_BOS, value)
+
+    def add_add_eos_token(self, value: bool) -> None:
+        self.add_bool(Keys.Tokenizer.ADD_EOS, value)
+
+    def add_add_space_prefix(self, value: bool) -> None:
+        self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)
+
+    def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
+        if isinstance(value, list):
+            template_default = None
+            template_names = set()
+
+            for choice in value:
+                name = choice.get('name', '')
+                template = choice.get('template')
+
+                # Allowing non-alphanumerical characters in template name is probably not a good idea, so filter it
+                name = ''.join((c if c in ascii_letters + digits else '_' for c in name))
+
+                if name and template is not None:
+                    if name == 'default':
+                        template_default = template
+                    else:
+                        template_names.add(name)
+                        self.add_string(Keys.Tokenizer.CHAT_TEMPLATE_N.format(name=name), template)
+
+            if template_names:
+                self.add_array(Keys.Tokenizer.CHAT_TEMPLATES, list(template_names))
+
+            if template_default is None:
+                return
+
+            value = template_default
+
+        self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
+
+    def add_prefix_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.PREFIX_ID, id)
+
+    def add_suffix_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.SUFFIX_ID, id)
+
+    def add_middle_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.MIDDLE_ID, id)
+
+    def add_eot_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.EOT_ID, id)
\ No newline at end of file