convert-*.py: Add source uuid generation

This commit is contained in:
brian khuu 2024-07-27 02:25:39 +10:00
parent be8306d795
commit 0c491520a8

View file

@ -10,6 +10,8 @@ import json
import os import os
import re import re
import sys import sys
import uuid
import hashlib
from enum import IntEnum from enum import IntEnum
from pathlib import Path from pathlib import Path
from hashlib import sha256 from hashlib import sha256
@ -255,6 +257,19 @@ class Model:
return False return False
def generate_source_tensors_uuid(self) -> str:
    """Derive a version-5 style UUID string from the source tensor contents.

    A SHA-1 hash is seeded with the bytes of a fixed UUID and then fed the
    raw bytes of every tensor yielded by ``self.get_tensors()``, in the order
    they are yielded. Each tensor is converted to float64, squeezed, and
    serialised in C order before hashing. Tensors whose name ends with one of
    the skipped suffixes are left out.

    Only tensor values are hashed; tensor names are not, and ``squeeze()``
    drops size-1 dimensions before serialisation.

    Returns:
        The first 16 bytes of the SHA-1 digest formatted as a UUID string
        with the version field set to 5.
    """
    # Tensors with these name suffixes do not contribute to the identifier.
    ignored_suffixes = (".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")

    # Seed the hash with the fixed UUID's 16 raw bytes.
    seed = uuid.UUID('ef001206-dadc-5f6d-a15f-3359e577d4e5')
    hasher = hashlib.sha1(seed.bytes)

    for tensor_name, tensor in self.get_tensors():
        if not tensor_name.endswith(ignored_suffixes):
            # Normalise every tensor to float64 so the hashed bytes do not
            # depend on the dtype the tensor was stored in.
            as_float64 = tensor.to(torch.float64).squeeze().numpy()
            hasher.update(as_float64.tobytes('C'))

    # SHA-1 yields 20 bytes; a UUID holds 16, so the digest is truncated.
    truncated_digest = hasher.digest()[:16]
    return str(uuid.UUID(bytes=truncated_digest, version=5))
def prepare_tensors(self): def prepare_tensors(self):
max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
@ -381,11 +396,15 @@ class Model:
# output in the same directory as the model by default # output in the same directory as the model by default
self.fname_out = self.dir_model / f"{fname_default}.gguf" self.fname_out = self.dir_model / f"{fname_default}.gguf"
# Upon missing source model uuid, generate uuid based on source tensor content
if not vocab_only and self.metadata.source_uuid is None:
self.metadata.source_uuid = self.generate_source_tensors_uuid()
logger.info(f"generating general.source_uuid: {self.metadata.source_uuid}")
# Upon missing model uuid, generate uuid based on tensor content # Upon missing model uuid, generate uuid based on tensor content
if not vocab_only and self.metadata.uuid is None: if not vocab_only and self.metadata.uuid is None:
self.metadata.uuid = self.gguf_writer.generate_tensors_uuid() self.metadata.uuid = self.gguf_writer.generate_tensors_uuid()
max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") logger.info(f"generating general.uuid: {self.metadata.uuid}")
logger.info(f"{f'%-{max_name_len}s' % f'generating general.uuid'} {self.metadata.uuid}")
self.set_type() self.set_type()
@ -3468,6 +3487,7 @@ class LazyTorchTensor(gguf.LazyBase):
_dtype_map: dict[torch.dtype, type] = { _dtype_map: dict[torch.dtype, type] = {
torch.float16: np.float16, torch.float16: np.float16,
torch.float32: np.float32, torch.float32: np.float32,
torch.float64: np.float64,
} }
# used for safetensors slices # used for safetensors slices