diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ff297dd69..1d8a87d2f 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -215,81 +215,6 @@ class Model: raise ValueError(f"Can not map tensor {name!r}") return new_name - def set_gguf_meta_model(self): - self.gguf_writer.add_name(self.metadata.name) - - if self.metadata.author is not None: - self.gguf_writer.add_author(self.metadata.author) - if self.metadata.version is not None: - self.gguf_writer.add_version(self.metadata.version) - if self.metadata.organization is not None: - self.gguf_writer.add_organization(self.metadata.organization) - - if self.metadata.finetune is not None: - self.gguf_writer.add_finetune(self.metadata.finetune) - if self.metadata.basename is not None: - self.gguf_writer.add_basename(self.metadata.basename) - - if self.metadata.description is not None: - self.gguf_writer.add_description(self.metadata.description) - if self.metadata.quantized_by is not None: - self.gguf_writer.add_quantized_by(self.metadata.quantized_by) - - if self.metadata.parameter_class_attribute is not None: - self.gguf_writer.add_parameter_class_attribute(self.metadata.parameter_class_attribute) - - if self.metadata.license is not None: - self.gguf_writer.add_license(self.metadata.license) - if self.metadata.license_name is not None: - self.gguf_writer.add_license_name(self.metadata.license_name) - if self.metadata.license_link is not None: - self.gguf_writer.add_license_link(self.metadata.license_link) - - if self.metadata.url is not None: - self.gguf_writer.add_url(self.metadata.url) - if self.metadata.doi is not None: - self.gguf_writer.add_doi(self.metadata.doi) - if self.metadata.uuid is not None: - self.gguf_writer.add_uuid(self.metadata.uuid) - if self.metadata.repo_url is not None: - self.gguf_writer.add_repo_url(self.metadata.repo_url) - - if self.metadata.source_url is not None: - self.gguf_writer.add_source_url(self.metadata.source_url) - if self.metadata.source_doi is not None: - 
self.gguf_writer.add_source_doi(self.metadata.source_doi) - if self.metadata.source_uuid is not None: - self.gguf_writer.add_source_uuid(self.metadata.source_uuid) - if self.metadata.source_repo_url is not None: - self.gguf_writer.add_source_repo_url(self.metadata.source_repo_url) - - if self.metadata.base_models is not None: - self.gguf_writer.add_base_model_count(len(self.metadata.base_models)) - for key, base_model_entry in enumerate(self.metadata.base_models): - if "name" in base_model_entry: - self.gguf_writer.add_base_model_name(key, base_model_entry["name"]) - if "author" in base_model_entry: - self.gguf_writer.add_base_model_author(key, base_model_entry["author"]) - if "version" in base_model_entry: - self.gguf_writer.add_base_model_version(key, base_model_entry["version"]) - if "organization" in base_model_entry: - self.gguf_writer.add_base_model_organization(key, base_model_entry["organization"]) - if "url" in base_model_entry: - self.gguf_writer.add_base_model_url(key, base_model_entry["url"]) - if "doi" in base_model_entry: - self.gguf_writer.add_base_model_doi(key, base_model_entry["doi"]) - if "uuid" in base_model_entry: - self.gguf_writer.add_base_model_uuid(key, base_model_entry["uuid"]) - if "repo_url" in base_model_entry: - self.gguf_writer.add_base_model_repo_url(key, base_model_entry["repo_url"]) - - if self.metadata.tags is not None: - self.gguf_writer.add_tags(self.metadata.tags) - if self.metadata.languages is not None: - self.gguf_writer.add_languages(self.metadata.languages) - if self.metadata.datasets is not None: - self.gguf_writer.add_datasets(self.metadata.datasets) - def set_gguf_parameters(self): self.gguf_writer.add_block_count(self.block_count) @@ -430,13 +355,30 @@ class Model: self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype) - def write(self): - self.write_tensors() + def prepare_key_value_store(self): + # Upon missing model uuid, generate uuid based on tensor content if self.metadata.uuid is None: 
self.metadata.uuid = self.gguf_writer.generate_tensors_uuid() - logger.info("generating general.uuid (based on tensor content) {0}".format(self.metadata.uuid)) + max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") + logger.info(f"{f'%-{max_name_len}s' % f'generating general.uuid'} {self.metadata.uuid}") + logger.info("Set meta model") + self.metadata.set_gguf_meta_model(self.gguf_writer) + + logger.info("Set model parameters") + self.gguf_writer.add_type(gguf.GGUFType.MODEL) + self.set_gguf_parameters() + + logger.info("Set model tokenizer") + self.set_vocab() + + logger.info("Set model quantization version") + self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + + def write(self): + self.write_tensors() + self.prepare_key_value_store() self.gguf_writer.write_header_to_file(self.fname_out) self.gguf_writer.write_kv_data_to_file() self.gguf_writer.write_tensors_to_file(progress=True) @@ -445,6 +387,12 @@ class Model: def write_vocab(self): if len(self.gguf_writer.tensors) != 1: raise ValueError('Splitting the vocabulary is not supported') + + if self.metadata.uuid is None: + # Tensor data is required, at least for uuid generation, when in vocab_only mode + self.write_tensors() + + self.prepare_key_value_store() self.gguf_writer.write_header_to_file(self.fname_out) self.gguf_writer.write_kv_data_to_file() self.gguf_writer.close() @@ -3703,18 +3651,6 @@ def main() -> None: print(f"{model_instance.fname_default}") # noqa: NP100 return - logger.info("Set meta model") - model_instance.set_gguf_meta_model() - - logger.info("Set model parameters") - model_instance.gguf_writer.add_type(gguf.GGUFType.MODEL) - model_instance.set_gguf_parameters() - - logger.info("Set model tokenizer") - model_instance.set_vocab() - - model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) - if args.vocab_only: logger.info("Exporting model vocab...") model_instance.write_vocab() diff --git a/gguf-py/gguf/gguf_writer.py 
b/gguf-py/gguf/gguf_writer.py index 037b6762f..f078110da 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -380,7 +380,7 @@ class GGUFWriter: assert ti.tensor.nbytes == ti.nbytes uuidv5_sha1.update(ti.tensor.tobytes('C')) - return uuid.UUID(bytes=uuidv5_sha1.digest()[:16], version=5) + return str(uuid.UUID(bytes=uuidv5_sha1.digest()[:16], version=5)) def write_tensors_to_file(self, *, progress: bool = False) -> None: self.write_ti_data_to_file() diff --git a/gguf-py/gguf/metadata.py b/gguf-py/gguf/metadata.py index d0c26fd6a..d81807ae3 100644 --- a/gguf-py/gguf/metadata.py +++ b/gguf-py/gguf/metadata.py @@ -2,7 +2,6 @@ from __future__ import annotations import re import json -import uuid import frontmatter from pathlib import Path from typing import Optional @@ -10,6 +9,8 @@ from dataclasses import dataclass from .constants import Keys +import gguf + @dataclass class Metadata: @@ -331,3 +332,78 @@ class Metadata: metadata.parameter_class_attribute = parameter_class_attribute return metadata + + def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter): + gguf_writer.add_name(self.name) + + if self.author is not None: + gguf_writer.add_author(self.author) + if self.version is not None: + gguf_writer.add_version(self.version) + if self.organization is not None: + gguf_writer.add_organization(self.organization) + + if self.finetune is not None: + gguf_writer.add_finetune(self.finetune) + if self.basename is not None: + gguf_writer.add_basename(self.basename) + + if self.description is not None: + gguf_writer.add_description(self.description) + if self.quantized_by is not None: + gguf_writer.add_quantized_by(self.quantized_by) + + if self.parameter_class_attribute is not None: + gguf_writer.add_parameter_class_attribute(self.parameter_class_attribute) + + if self.license is not None: + gguf_writer.add_license(self.license) + if self.license_name is not None: + gguf_writer.add_license_name(self.license_name) + if self.license_link is not 
None: + gguf_writer.add_license_link(self.license_link) + + if self.url is not None: + gguf_writer.add_url(self.url) + if self.doi is not None: + gguf_writer.add_doi(self.doi) + if self.uuid is not None: + gguf_writer.add_uuid(self.uuid) + if self.repo_url is not None: + gguf_writer.add_repo_url(self.repo_url) + + if self.source_url is not None: + gguf_writer.add_source_url(self.source_url) + if self.source_doi is not None: + gguf_writer.add_source_doi(self.source_doi) + if self.source_uuid is not None: + gguf_writer.add_source_uuid(self.source_uuid) + if self.source_repo_url is not None: + gguf_writer.add_source_repo_url(self.source_repo_url) + + if self.base_models is not None: + gguf_writer.add_base_model_count(len(self.base_models)) + for key, base_model_entry in enumerate(self.base_models): + if "name" in base_model_entry: + gguf_writer.add_base_model_name(key, base_model_entry["name"]) + if "author" in base_model_entry: + gguf_writer.add_base_model_author(key, base_model_entry["author"]) + if "version" in base_model_entry: + gguf_writer.add_base_model_version(key, base_model_entry["version"]) + if "organization" in base_model_entry: + gguf_writer.add_base_model_organization(key, base_model_entry["organization"]) + if "url" in base_model_entry: + gguf_writer.add_base_model_url(key, base_model_entry["url"]) + if "doi" in base_model_entry: + gguf_writer.add_base_model_doi(key, base_model_entry["doi"]) + if "uuid" in base_model_entry: + gguf_writer.add_base_model_uuid(key, base_model_entry["uuid"]) + if "repo_url" in base_model_entry: + gguf_writer.add_base_model_repo_url(key, base_model_entry["repo_url"]) + + if self.tags is not None: + gguf_writer.add_tags(self.tags) + if self.languages is not None: + gguf_writer.add_languages(self.languages) + if self.datasets is not None: + gguf_writer.add_datasets(self.datasets)