From 0c491520a87ee74146513abb1af609b28f1997f7 Mon Sep 17 00:00:00 2001 From: brian khuu Date: Sat, 27 Jul 2024 02:25:39 +1000 Subject: [PATCH] convert-*.py: Add source uuid generation --- convert_hf_to_gguf.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index cc3dc6a2e..dc56621a5 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -10,6 +10,8 @@ import json import os import re import sys +import uuid +import hashlib from enum import IntEnum from pathlib import Path from hashlib import sha256 @@ -255,6 +257,19 @@ class Model: return False + def generate_source_tensors_uuid(self) -> str: + uuidv5_sha1 = hashlib.sha1() + uuidv5_sha1.update(uuid.UUID('ef001206-dadc-5f6d-a15f-3359e577d4e5').bytes) + + for name, data_torch in self.get_tensors(): + # we don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): + continue + data: np.ndarray = data_torch.to(torch.float64).squeeze().numpy() + uuidv5_sha1.update(data.tobytes('C')) + + return str(uuid.UUID(bytes=uuidv5_sha1.digest()[:16], version=5)) + def prepare_tensors(self): max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") @@ -381,11 +396,15 @@ class Model: # output in the same directory as the model by default self.fname_out = self.dir_model / f"{fname_default}.gguf" + # Upon missing source model uuid, generate uuid based on source tensor content + if not vocab_only and self.metadata.source_uuid is None: + self.metadata.source_uuid = self.generate_source_tensors_uuid() + logger.info(f"generating general.source_uuid: {self.metadata.source_uuid}") + # Upon missing model uuid, generate uuid based on tensor content if not vocab_only and self.metadata.uuid is None: self.metadata.uuid = self.gguf_writer.generate_tensors_uuid() - max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") - logger.info(f"{f'%-{max_name_len}s' % f'generating general.uuid'} {self.metadata.uuid}") + logger.info(f"generating general.uuid: {self.metadata.uuid}") self.set_type() @@ -3468,6 +3487,7 @@ class LazyTorchTensor(gguf.LazyBase): _dtype_map: dict[torch.dtype, type] = { torch.float16: np.float16, torch.float32: np.float32, + torch.float64: np.float64, } # used for safetensors slices