gguf_writer.py: generate tensor uuid if missing

This commit is contained in:
brian khuu 2024-07-09 06:52:44 +10:00
parent 4dc8ddd35a
commit 007708e32d
3 changed files with 21 additions and 55 deletions

View file

@ -432,6 +432,11 @@ class Model:
def write(self):
self.write_tensors()
if self.metadata.uuid is None:
self.metadata.uuid = self.gguf_writer.generate_tensors_uuid()
logger.info("generating general.uuid (based on tensor content) {0}".format(self.metadata.uuid))
self.gguf_writer.write_header_to_file(self.fname_out)
self.gguf_writer.write_kv_data_to_file()
self.gguf_writer.write_tensors_to_file(progress=True)

View file

@ -2,6 +2,8 @@ from __future__ import annotations
import logging
import os
import uuid
import hashlib
import shutil
import struct
import tempfile
@ -115,6 +117,7 @@ class GGUFWriter:
if self.state is WriterState.EMPTY and self.fout is not None and (path is None or path == self.path):
# allow calling this multiple times as long as the path is the same
return
if self.state is not WriterState.NO_FILE:
raise ValueError(f'Expected output file to be not yet opened, got {self.state}')
@ -366,6 +369,19 @@ class GGUFWriter:
self.state = WriterState.WEIGHTS
def generate_tensors_uuid(self) -> uuid.UUID:
    """Build a deterministic, content-derived UUID from the queued tensors.

    A SHA-1 hash is seeded with a fixed namespace UUID and then fed the
    C-ordered bytes of every tensor in ``self.tensors``, in iteration
    order.  The first 16 bytes of the digest are turned into a UUID with
    the version field forced to 5.

    Returns:
        The resulting ``uuid.UUID``.  The value depends on the tensor
        bytes and on the order in which the tensors are stored.

    Raises:
        ValueError: if a tensor entry has no data attached, or if the
            size of its data disagrees with the recorded ``nbytes``.
    """
    uuidv5_sha1 = hashlib.sha1()
    # Fixed namespace so the result is reproducible for identical content.
    uuidv5_sha1.update(uuid.UUID('ef001206-dadc-5f6d-a15f-3359e577d4e5').bytes)

    for tensors in self.tensors:
        # relying on the fact that Python dicts preserve insertion order (since 3.7)
        for name, ti in tensors.items():
            # Explicit raises rather than asserts: these checks must still
            # run when Python is started with -O.
            if ti.tensor is None:
                raise ValueError(f'Tensor {name!r} has no data to hash')
            if ti.tensor.nbytes != ti.nbytes:
                raise ValueError(
                    f'Tensor {name!r} size mismatch: data has {ti.tensor.nbytes} bytes, expected {ti.nbytes}'
                )
            uuidv5_sha1.update(ti.tensor.tobytes('C'))

    # version=5 makes the constructor overwrite the variant/version bits.
    return uuid.UUID(bytes=uuidv5_sha1.digest()[:16], version=5)
def write_tensors_to_file(self, *, progress: bool = False) -> None:
self.write_ti_data_to_file()

View file

@ -95,9 +95,6 @@ class Metadata:
if model_name is not None:
metadata.name = model_name
# If any UUID is still missing at this point, then we should fill it in
metadata = Metadata.generate_any_missing_uuid(metadata)
return metadata
@staticmethod
@ -334,55 +331,3 @@ class Metadata:
metadata.parameter_class_attribute = parameter_class_attribute
return metadata
@staticmethod
def generate_any_missing_uuid(metadata: Metadata) -> Metadata:
    """Fill in UUIDs that are still unset, deriving them from known links.

    * ``metadata.uuid``: UUIDv5 of the DOI URL, repo URL or URL (first one
      available, in that order); a random UUIDv4 if none is available.
    * ``metadata.source_uuid``: UUIDv5 of the source DOI URL, source repo
      URL or source URL; left untouched if none is available.
    * each dict in ``metadata.base_models`` lacking a ``"uuid"`` key:
      UUIDv5 of its ``"repo_url"``, ``"url"`` or DOI URL; left untouched
      if none is available.

    DOIs are always expanded to ``https://doi.org/<doi>`` before hashing,
    so the same DOI yields the same UUID wherever it is referenced.

    Args:
        metadata: object to update in place.

    Returns:
        The same ``metadata`` object.
    """

    def doi_link(doi):
        # Hash DOIs as resolvable URLs, since the namespace is NAMESPACE_URL.
        return None if doi is None else f"https://doi.org/{doi}"

    def first_link_uuid(*links):
        # UUIDv5 of the first link that is set, or None when all are missing.
        for link in links:
            if link is not None:
                return uuid.uuid5(uuid.NAMESPACE_URL, link)
        return None

    # UUID Generation if not already provided
    if metadata.uuid is None:
        derived = first_link_uuid(doi_link(metadata.doi), metadata.repo_url, metadata.url)
        if derived is None:
            derived = uuid.uuid4()  # every model must have at least a random UUIDv4
        metadata.uuid = str(derived)

    if metadata.source_uuid is None:
        # Generate a UUID based on provided links/id only if source provided
        derived = first_link_uuid(
            doi_link(metadata.source_doi), metadata.source_repo_url, metadata.source_url
        )
        if derived is not None:
            metadata.source_uuid = str(derived)

    if metadata.base_models is not None:
        for model_entry in metadata.base_models:
            if "uuid" in model_entry:
                continue
            # Same rule as for the source: only derive when a link is known.
            # A key holding None is treated like a missing key.
            derived = first_link_uuid(
                model_entry.get("repo_url"),
                model_entry.get("url"),
                doi_link(model_entry.get("doi")),
            )
            if derived is not None:
                model_entry["uuid"] = str(derived)

    return metadata