diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 36430b40a..ff297dd69 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -432,6 +432,12 @@ class Model:
 
     def write(self):
         self.write_tensors()
+
+        if self.metadata.uuid is None:
+            # Metadata UUID fields hold strings, so convert the uuid.UUID before storing it.
+            self.metadata.uuid = str(self.gguf_writer.generate_tensors_uuid())
+            logger.info("generating general.uuid (based on tensor content) {0}".format(self.metadata.uuid))
+
         self.gguf_writer.write_header_to_file(self.fname_out)
         self.gguf_writer.write_kv_data_to_file()
         self.gguf_writer.write_tensors_to_file(progress=True)
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 0f94f2dde..037b6762f 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -2,6 +2,8 @@ from __future__ import annotations
 
 import logging
 import os
+import uuid
+import hashlib
 import shutil
 import struct
 import tempfile
@@ -115,6 +117,7 @@ class GGUFWriter:
         if self.state is WriterState.EMPTY and self.fout is not None and (path is None or path == self.path):
             # allow calling this multiple times as long as the path is the same
             return
+
         if self.state is not WriterState.NO_FILE:
             raise ValueError(f'Expected output file to be not yet opened, got {self.state}')
 
@@ -366,6 +369,25 @@ class GGUFWriter:
 
         self.state = WriterState.WEIGHTS
 
+    def generate_tensors_uuid(self) -> uuid.UUID:
+        """Return a version-5 UUID derived from the bytes of every tensor in self.tensors.
+
+        A SHA-1 hash is seeded with a fixed namespace UUID and then fed each
+        tensor's C-ordered bytes in iteration order; the first 16 bytes of the
+        digest become the UUID, so identical tensor content yields the same value.
+        """
+        uuidv5_sha1 = hashlib.sha1()
+        uuidv5_sha1.update(uuid.UUID('ef001206-dadc-5f6d-a15f-3359e577d4e5').bytes)
+
+        for tensors in self.tensors:
+            # relying on the fact that Python dicts preserve insertion order (since 3.7)
+            for ti in tensors.values():
+                assert ti.tensor is not None
+                assert ti.tensor.nbytes == ti.nbytes
+                uuidv5_sha1.update(ti.tensor.tobytes('C'))
+
+        return uuid.UUID(bytes=uuidv5_sha1.digest()[:16], version=5)
+
     def write_tensors_to_file(self, *, progress: bool = False) -> None:
         self.write_ti_data_to_file()
 
diff --git a/gguf-py/gguf/metadata.py b/gguf-py/gguf/metadata.py
index 6ae4e044c..d0c26fd6a 100644
--- a/gguf-py/gguf/metadata.py
+++ b/gguf-py/gguf/metadata.py
@@ -95,9 +95,6 @@ class Metadata:
         if model_name is not None:
             metadata.name = model_name
 
-        # If any UUID is still missing at this point, then we should fill it in
-        metadata = Metadata.generate_any_missing_uuid(metadata)
-
         return metadata
 
     @staticmethod
@@ -334,55 +331,3 @@ class Metadata:
             metadata.parameter_class_attribute = parameter_class_attribute
 
         return metadata
-
-    @staticmethod
-    def generate_any_missing_uuid(metadata: Metadata) -> Metadata:
-
-        # UUID Generation if not already provided
-        if metadata.uuid is None:
-            # Generate UUID based on provided links/id. UUIDv4 used as fallback
-            new_uuid = None
-
-            if metadata.doi is not None:
-                new_uuid = uuid.uuid5(uuid.NAMESPACE_URL, f"https://doi.org/{metadata.doi}")
-            elif metadata.repo_url is not None:
-                new_uuid = uuid.uuid5(uuid.NAMESPACE_URL, metadata.repo_url)
-            elif metadata.url is not None:
-                new_uuid = uuid.uuid5(uuid.NAMESPACE_URL, metadata.url)
-            else:
-                new_uuid = uuid.uuid4() # every model must have at least a random UUIDv4
-
-            if new_uuid is not None:
-                metadata.uuid = str(new_uuid)
-
-        if metadata.source_uuid is None:
-            # Generate a UUID based on provided links/id only if source provided
-            new_uuid = None
-
-            if metadata.source_doi is not None:
-                new_uuid = uuid.uuid5(uuid.NAMESPACE_URL, f"https://doi.org/{metadata.source_doi}")
-            elif metadata.source_repo_url is not None:
-                new_uuid = uuid.uuid5(uuid.NAMESPACE_URL, metadata.source_repo_url)
-            elif metadata.source_url is not None:
-                new_uuid = uuid.uuid5(uuid.NAMESPACE_URL, metadata.source_url)
-
-            if new_uuid is not None:
-                metadata.source_uuid = str(new_uuid)
-
-        if metadata.base_models is not None:
-            for model_entry in metadata.base_models:
-                if "uuid" not in model_entry:
-                    # Generate a UUID based on provided links/id only if source provided
-                    new_uuid = None
-
-                    if "repo_url" in model_entry:
-                        new_uuid = uuid.uuid5(uuid.NAMESPACE_URL, model_entry["repo_url"])
-                    elif "url" in model_entry:
-                        new_uuid = uuid.uuid5(uuid.NAMESPACE_URL, model_entry["url"])
-                    elif "doi" in model_entry:
-                        new_uuid = uuid.uuid5(uuid.NAMESPACE_URL, model_entry["doi"])
-
-                    if new_uuid is not None:
-                        model_entry["uuid"] = str(new_uuid)
-
-        return metadata