convert*.py: inline source uuid generation approach

2024-07-27 13:03:13 +10:00 · 2024-07-27 13:03:13 +10:00 · 3fb690e91b
commit 3fb690e91b
parent 0c491520a8
2 changed files with 16 additions and 35 deletions
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -64,6 +64,7 @@ class Model:
    gguf_writer: gguf.GGUFWriter
    model_name: str | None
    metadata_override: Path | None
+    generated_source_uuid: str | None

    # subclasses should define this!
    model_arch: gguf.MODEL_ARCH
@@ -257,23 +258,18 @@ class Model:

        return False

-    def generate_source_tensors_uuid(self) -> str:
+    def prepare_tensors(self):
+
        uuidv5_sha1 = hashlib.sha1()
        uuidv5_sha1.update(uuid.UUID('ef001206-dadc-5f6d-a15f-3359e577d4e5').bytes)

-        for name, data_torch in self.get_tensors():
-            # we don't need these
-            if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
-                continue
-            data: np.ndarray = data_torch.to(torch.float64).squeeze().numpy()
-            uuidv5_sha1.update(data.tobytes('C'))
-
-        return str(uuid.UUID(bytes=uuidv5_sha1.digest()[:16], version=5))
-
-    def prepare_tensors(self):
        max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")

        for name, data_torch in self.get_tensors():
+
+            uuidv5_data_buffer: np.ndarray = data_torch.to(torch.float64).numpy()
+            uuidv5_sha1.update(uuidv5_data_buffer.tobytes('C'))
+
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
                continue
@@ -353,6 +349,9 @@ class Model:

                self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)

+        # Upon missing source model uuid, generate uuid based on source tensor content
+        self.generated_source_uuid = str(uuid.UUID(bytes=uuidv5_sha1.digest()[:16], version=5))
+
    def set_type(self):
        self.gguf_writer.add_type(gguf.GGUFType.MODEL)

@@ -396,15 +395,12 @@ class Model:
                # output in the same directory as the model by default
                self.fname_out = self.dir_model / f"{fname_default}.gguf"

-        # Upon missing source model uuid, generate uuid based on source tensor content
-        if not vocab_only and self.metadata.source_uuid is None:
-            self.metadata.source_uuid = self.generate_source_tensors_uuid()
-            logger.info(f"generating general.source_uuid: {self.metadata.source_uuid}")
-
-        # Upon missing model uuid, generate uuid based on tensor content
-        if not vocab_only and self.metadata.uuid is None:
-            self.metadata.uuid = self.gguf_writer.generate_tensors_uuid()
-            logger.info(f"generating general.uuid: {self.metadata.uuid}")
+        if not vocab_only:
+            if self.metadata.source_uuid is not None:
+                logger.info(f"Source UUID present: {self.metadata.source_uuid}")
+            elif self.generated_source_uuid is not None:
+                logger.info(f"Source UUID missing. Using generated source uuid: {self.generated_source_uuid}")
+                self.metadata.source_uuid = self.generated_source_uuid

        self.set_type()

--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -2,8 +2,6 @@ from __future__ import annotations

 import logging
 import os
-import uuid
-import hashlib
 import shutil
 import struct
 import tempfile
@@ -419,19 +417,6 @@ class GGUFWriter:

        self.state = WriterState.WEIGHTS

-    def generate_tensors_uuid(self) -> str:
-        uuidv5_sha1 = hashlib.sha1()
-        uuidv5_sha1.update(uuid.UUID('ef001206-dadc-5f6d-a15f-3359e577d4e5').bytes)
-
-        for tensors in self.tensors:
-            # relying on the fact that Python dicts preserve insertion order (since 3.7)
-            for name, ti in tensors.items():
-                assert ti.tensor is not None
-                assert ti.tensor.nbytes == ti.nbytes
-                uuidv5_sha1.update(ti.tensor.tobytes('C'))
-
-        return str(uuid.UUID(bytes=uuidv5_sha1.digest()[:16], version=5))
-
    def write_tensors_to_file(self, *, progress: bool = False) -> None:
        self.write_ti_data_to_file()