From dbb1b471e475631693109a74ac3f13f1dca1bb9c Mon Sep 17 00:00:00 2001
From: brian khuu
Date: Fri, 24 May 2024 03:48:00 +1000
Subject: [PATCH] convert-*.py: add --get-outfile command and refactor

---
 convert_hf_to_gguf.py            | 209 +++++++++++++++++++++++++------
 examples/convert_legacy_llama.py |  82 +++++-------
 gguf-py/gguf/__init__.py         |   1 +
 gguf-py/gguf/utility.py          |  35 ++++++
 4 files changed, 240 insertions(+), 87 deletions(-)
 create mode 100644 gguf-py/gguf/utility.py

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index a755b0a60..7ecedfcde 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -13,7 +13,8 @@ import sys
 from enum import IntEnum
 from pathlib import Path
 from hashlib import sha256
-from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
+from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast, Optional
+from dataclasses import dataclass
 
 import math
 import numpy as np
@@ -29,6 +30,42 @@ import gguf
 
 logger = logging.getLogger("hf-to-gguf")
 
+
+@dataclass
+class Metadata:
+    name: Optional[str] = None
+    author: Optional[str] = None
+    version: Optional[str] = None
+    url: Optional[str] = None
+    description: Optional[str] = None
+    licence: Optional[str] = None
+    source_url: Optional[str] = None
+    source_hf_repo: Optional[str] = None
+
+    @staticmethod
+    def load(metadata_path: Path) -> Metadata:
+        if metadata_path is None or not metadata_path.exists():
+            return Metadata()
+
+        with open(metadata_path, 'r') as file:
+            data = json.load(file)
+
+        # Create a new Metadata instance
+        metadata = Metadata()
+
+        # Assigning values to Metadata attributes if they exist in the JSON file
+        # This is based on LLM_KV_NAMES mapping in llama.cpp
+        metadata.name = data.get("general.name")
+        metadata.author = data.get("general.author")
+        metadata.version = data.get("general.version")
+        metadata.url = data.get("general.url")
+        metadata.description = data.get("general.description")
+        metadata.licence = data.get("general.license")
+        metadata.source_url = data.get("general.source.url")
+        metadata.source_hf_repo = data.get("general.source.huggingface.repository")
+
+        return metadata
+
+
 ###### MODEL DEFINITIONS ######
 
 class SentencePieceTokenTypes(IntEnum):
@@ -46,13 +83,13 @@ AnyModel = TypeVar("AnyModel", bound="type[Model]")
 class Model:
     _model_classes: dict[str, type[Model]] = {}
 
+    model_name: str
     dir_model: Path
     ftype: gguf.LlamaFileType
     is_big_endian: bool
     endianess: gguf.GGUFEndian
     use_temp_file: bool
     lazy: bool
-    model_name: str | None
     part_names: list[str]
     is_safetensors: bool
     hparams: dict[str, Any]
@@ -60,12 +97,14 @@ class Model:
     tensor_map: gguf.TensorNameMap
     tensor_names: set[str] | None
     fname_out: Path
+    fname_default: Path
     gguf_writer: gguf.GGUFWriter
+    metadata: Metadata
 
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH
 
-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool,
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, metadata: Metadata,
                  model_name: str | None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
@@ -84,15 +123,20 @@ class Model:
         self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
+        self.metadata = metadata
+
+        model_tensors = self.get_tensors()
+
         if self.ftype == gguf.LlamaFileType.GUESSED:
             # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
-            _, first_tensor = next(self.get_tensors())
+            _, first_tensor = next(model_tensors)
             if first_tensor.dtype == torch.float16:
                 logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})")
                 self.ftype = gguf.LlamaFileType.MOSTLY_F16
             else:
                 logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
                 self.ftype = gguf.LlamaFileType.MOSTLY_BF16
+
         ftype_up: str = self.ftype.name.partition("_")[2].upper()
         ftype_lw: str = ftype_up.lower()
         # allow templating the file name with the output ftype, useful with the "auto" ftype
@@ -100,6 +144,84 @@ class Model:
         self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
                                            split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)
 
+        # Update any missing authorship metadata with huggingface_parameters
+        if self.metadata is not None and self.metadata.source_hf_repo is None:
+            if self.hparams is not None and "_name_or_path" in self.hparams:
+                self.metadata.source_hf_repo = self.hparams["_name_or_path"]
+
+        # Set model name based on latest metadata either provided or calculated from environment
+        def get_model_name(metadata, huggingface_parameters, dir_model, model_arch):
+            if metadata is not None and metadata.name is not None:
+                # Explicit Metadata Was Provided By User
+                return metadata.name
+            elif huggingface_parameters is not None and "_name_or_path" in huggingface_parameters:
+                # Hugging Face Parameters Model Name or Model Folder Name is Provided
+                return huggingface_parameters["_name_or_path"]
+            elif huggingface_parameters is not None and "model_type" in huggingface_parameters:
+                # Hugging Face Parameters Model Type is Provided
+                return huggingface_parameters["model_type"]
+            elif dir_model is not None and dir_model.name is not None:
+                # Use directory folder name
+                return dir_model.name
+            else:
+                return gguf.MODEL_ARCH_NAMES[model_arch]
+        self.model_name = get_model_name(self.metadata, self.hparams, self.dir_model, self.model_arch)
+
+        # Extracts and converts the encoding scheme from the given file type name. e.g. 'gguf.LlamaFileType.ALL_F32' --> 'F32'
+        encodingScheme = self.ftype.name.partition("_")[2]
+
+        # Get Expert Count From huggingface_parameters
+        expert_count = self.hparams["num_local_experts"] if "num_local_experts" in self.hparams else None
+
+        def per_model_weight_count_estimation(tensors, expert_count):
+            # TODO: Ensure parameter count is accurate throughout various model type
+            # May currently overestimate parameter count in Mamba model because
+            # output weights is tied with token embeddings.
+            sum_weight_estimate = 0
+            for name, data_torch in tensors:
+                # Got A Tensor
+
+                # We don't need these
+                if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
+                    continue
+
+                # Calculate Tensor Volume
+                sum_weights_in_tensor = 1
+                for dim in data_torch.shape:
+                    sum_weights_in_tensor *= dim
+
+                # Add Tensor Volume To Running Count
+                sum_weight_estimate += sum_weights_in_tensor
+
+            # Calculate weight estimate per model
+            per_model_weight_estimate = (sum_weight_estimate / expert_count) if (expert_count is not None and expert_count > 0) else sum_weight_estimate
+
+            return per_model_weight_estimate
+
+        weight_estimate = per_model_weight_count_estimation(model_tensors, expert_count)
+
+        # Generate default filename based on model specification and available metadata
+        self.fname_default = gguf.naming_convention(self.model_name, self.metadata.version, expert_count, weight_estimate, encodingScheme)
+
+        # Filename Output
+        if fname_out is not None:
+            # custom defined filename and path was provided
+            def fill_templated_filename(filename: str, encodingScheme: str):
+                # Given a file name fill in any type templates e.g. 'some-model-name.{ftype}.gguf'
+                ftype_uppercase: str = encodingScheme.upper()
+                ftype_lowercase: str = encodingScheme.lower()
+                return filename.format(ftype_lowercase,
+                                       outtype=ftype_lowercase, ftype=ftype_lowercase,
+                                       OUTTYPE=ftype_uppercase, FTYPE=ftype_uppercase)
+
+            self.fname_out = fname_out.parent / fill_templated_filename(fname_out.name, encodingScheme)
+        else:
+            # output in the same directory as the model by default
+            self.fname_out = dir_model.parent / self.fname_default
+
+        # Configure GGUF Writer
+        self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
+
     @classmethod
     def __init_subclass__(cls):
         # can't use an abstract property, because overriding it without type errors
@@ -185,8 +307,26 @@ class Model:
             raise ValueError(f"Can not map tensor {name!r}")
         return new_name
 
+    def set_gguf_meta_model(self):
+        self.gguf_writer.add_name(self.model_name)
+
+        if self.metadata is not None:
+            if self.metadata.author is not None:
+                self.gguf_writer.add_author(self.metadata.author)
+            if self.metadata.version is not None:
+                self.gguf_writer.add_version(self.metadata.version)
+            if self.metadata.url is not None:
+                self.gguf_writer.add_url(self.metadata.url)
+            if self.metadata.description is not None:
+                self.gguf_writer.add_description(self.metadata.description)
+            if self.metadata.licence is not None:
+                self.gguf_writer.add_licence(self.metadata.licence)
+            if self.metadata.source_url is not None:
+                self.gguf_writer.add_source_url(self.metadata.source_url)
+            if self.metadata.source_hf_repo is not None:
+                self.gguf_writer.add_source_hf_repo(self.metadata.source_hf_repo)
+
     def set_gguf_parameters(self):
-        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_block_count(self.block_count)
 
         if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
@@ -773,7 +913,6 @@ class GPTNeoXModel(Model):
     def set_gguf_parameters(self):
         block_count = self.hparams["num_hidden_layers"]
 
-        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -829,7 +968,6 @@ class BloomModel(Model):
     model_arch = gguf.MODEL_ARCH.BLOOM
 
     def set_gguf_parameters(self):
-        self.gguf_writer.add_name("Bloom")
         n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
         n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
         self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
@@ -906,7 +1044,6 @@ class MPTModel(Model):
 
     def set_gguf_parameters(self):
         block_count = self.hparams["n_layers"]
-        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
         self.gguf_writer.add_embedding_length(self.hparams["d_model"])
         self.gguf_writer.add_block_count(block_count)
@@ -945,7 +1082,6 @@ class OrionModel(Model):
         block_count = self.hparams["num_hidden_layers"]
         head_count = self.hparams["num_attention_heads"]
         head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-        hf_repo = self.hparams.get("_name_or_path", "")
 
         ctx_length = 0
         if "max_sequence_length" in self.hparams:
@@ -958,8 +1094,6 @@ class OrionModel(Model):
             raise ValueError("gguf: can not find ctx length parameter.")
 
         self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
-        self.gguf_writer.add_source_hf_repo(hf_repo)
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -983,7 +1117,6 @@ class BaichuanModel(Model):
         block_count = self.hparams["num_hidden_layers"]
         head_count = self.hparams["num_attention_heads"]
         head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-        hf_repo = self.hparams.get("_name_or_path", "")
 
         ctx_length = 0
         if "max_sequence_length" in self.hparams:
@@ -995,8 +1128,6 @@ class BaichuanModel(Model):
         else:
             raise ValueError("gguf: can not find ctx length parameter.")
 
-        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
-        self.gguf_writer.add_source_hf_repo(hf_repo)
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1110,7 +1241,6 @@ class XverseModel(Model):
         block_count = self.hparams["num_hidden_layers"]
         head_count = self.hparams["num_attention_heads"]
         head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-        hf_repo = self.hparams.get("_name_or_path", "")
 
         ctx_length = 0
         if "max_sequence_length" in self.hparams:
@@ -1122,8 +1252,6 @@ class XverseModel(Model):
         else:
             raise ValueError("gguf: can not find ctx length parameter.")
 
-        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
-        self.gguf_writer.add_source_hf_repo(hf_repo)
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1182,7 +1310,6 @@ class FalconModel(Model):
         if n_head_kv is None:
             n_head_kv = self.hparams.get("n_head_kv", 1)  # old name
 
-        self.gguf_writer.add_name("Falcon")
         self.gguf_writer.add_context_length(2048)  # not in config.json
         self.gguf_writer.add_tensor_data_layout("jploski")  # qkv tensor transform
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1227,7 +1354,6 @@ class StarCoderModel(Model):
     def set_gguf_parameters(self):
         block_count = self.hparams["n_layer"]
 
-        self.gguf_writer.add_name("StarCoder")
         self.gguf_writer.add_context_length(self.hparams["n_positions"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
         self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
@@ -1262,7 +1388,6 @@ class RefactModel(Model):
 
         block_count = self.hparams["n_layer"]
 
-        self.gguf_writer.add_name("Refact")
         # refact uses Alibi. So this is from config.json which might be used by training.
         self.gguf_writer.add_context_length(self.hparams["n_positions"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@@ -1317,7 +1442,6 @@ class StableLMModel(Model):
         hparams = self.hparams
         block_count = hparams["num_hidden_layers"]
 
-        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -1560,7 +1684,6 @@ class GrokModel(Model):
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        self.gguf_writer.add_name("Grok")
 
     _experts: list[dict[str, Tensor]] | None = None
 
@@ -1609,7 +1732,6 @@ class DbrxModel(Model):
     def set_gguf_parameters(self):
         ffn_config = self.hparams["ffn_config"]
         attn_config = self.hparams["attn_config"]
-        self.gguf_writer.add_name(self.hparams["model_type"])
         self.gguf_writer.add_block_count(self.hparams["n_layers"])
 
         self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
@@ -1678,7 +1800,6 @@ class MiniCPMModel(Model):
 
     def set_gguf_parameters(self):
         block_count = self.hparams["num_hidden_layers"]
-        self.gguf_writer.add_name("MiniCPM")
         self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -1748,7 +1869,6 @@ class QwenModel(Model):
         self._set_vocab_qwen()
 
     def set_gguf_parameters(self):
-        self.gguf_writer.add_name("Qwen")
         self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
         self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -1839,7 +1959,6 @@ class GPT2Model(Model):
     model_arch = gguf.MODEL_ARCH.GPT2
 
     def set_gguf_parameters(self):
-        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_block_count(self.hparams["n_layer"])
         self.gguf_writer.add_context_length(self.hparams["n_ctx"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@@ -1882,7 +2001,6 @@ class Phi2Model(Model):
         n_embd = self.find_hparam(["hidden_size", "n_embd"])
         n_head = self.find_hparam(["num_attention_heads", "n_head"])
 
-        self.gguf_writer.add_name("Phi2")
         self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
 
         self.gguf_writer.add_embedding_length(n_embd)
@@ -2004,7 +2122,6 @@ class Phi3MiniModel(Model):
         orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
         rope_dims = n_embd // n_head
 
-        self.gguf_writer.add_name("Phi3")
         self.gguf_writer.add_context_length(max_pos_embds)
         self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
         self.gguf_writer.add_embedding_length(n_embd)
@@ -2061,7 +2178,6 @@ class PlamoModel(Model):
         hparams = self.hparams
         block_count = hparams["num_hidden_layers"]
 
-        self.gguf_writer.add_name("PLaMo")
         self.gguf_writer.add_context_length(4096)  # not in config.json
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
@@ -2106,7 +2222,6 @@ class CodeShellModel(Model):
     def set_gguf_parameters(self):
         block_count = self.hparams["n_layer"]
 
-        self.gguf_writer.add_name("CodeShell")
         self.gguf_writer.add_context_length(self.hparams["n_positions"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
         self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
@@ -2265,7 +2380,6 @@ class InternLM2Model(Model):
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
-        self.gguf_writer.add_name("InternLM2")
         self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
         self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
@@ -2433,7 +2547,6 @@ class GemmaModel(Model):
         hparams = self.hparams
         block_count = hparams["num_hidden_layers"]
 
-        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -2549,7 +2662,6 @@ class MambaModel(Model):
         # Fail early for models which don't have a block expansion factor of 2
         assert d_inner == 2 * d_model
 
-        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
         self.gguf_writer.add_context_length(2**20)  # arbitrary value; for those who use the default
         self.gguf_writer.add_embedding_length(d_model)
         self.gguf_writer.add_feed_forward_length(0)  # unused, but seemingly required when loading
@@ -3505,6 +3617,14 @@ def parse_args() -> argparse.Namespace:
         "--no-tensor-first-split", action="store_true",
         help="do not add tensors to the first split (disabled by default)"
     )
+    parser.add_argument(
+        "--metadata", type=Path,
+        help="Specify the path for a metadata file"
+    )
+    parser.add_argument(
+        "--get-outfile", action="store_true",
+        help="get calculated default outfile name"
+    )
 
     return parser.parse_args()
 
@@ -3530,8 +3650,15 @@ def split_str_to_n_bytes(split_str: str) -> int:
 
 def main() -> None:
     args = parse_args()
-    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+    if args.verbose:
+        logging.basicConfig(level=logging.DEBUG)
+    elif args.get_outfile:
+        # Avoid printing anything besides the dump output
+        logging.basicConfig(level=logging.WARNING)
+    else:
+        logging.basicConfig(level=logging.INFO)
+    metadata = Metadata.load(args.metadata)
 
     dir_model = args.model
 
     if not dir_model.is_dir():
@@ -3562,17 +3689,27 @@ def main() -> None:
     hparams = Model.load_hparams(dir_model)
 
     with torch.inference_mode():
+        encodingScheme = ftype_map[args.outtype]
+        model_architecture = hparams["architectures"][0]
+
         try:
-            model_class = Model.from_model_architecture(hparams["architectures"][0])
+            model_class = Model.from_model_architecture(model_architecture)
         except NotImplementedError:
logger.error(f"Model {hparams['architectures'][0]} is not supported") sys.exit(1) - model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, + model_instance = model_class(dir_model, encodingScheme, fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name, split_max_tensors=args.split_max_tensors, split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, small_first_shard=args.no_tensor_first_split) + if args.get_outfile: + print(f"{model_instance.fname_default}") # noqa: NP100 + return + + logger.info("Set meta model") + model_instance.set_gguf_meta_model() + logger.info("Set model parameters") model_instance.gguf_writer.add_type(gguf.GGUFType.MODEL) model_instance.set_gguf_parameters() diff --git a/examples/convert_legacy_llama.py b/examples/convert_legacy_llama.py index 65c66e5ff..eee8f1fee 100755 --- a/examples/convert_legacy_llama.py +++ b/examples/convert_legacy_llama.py @@ -1021,35 +1021,28 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT raise ValueError(f"Unexpected combination of types: {name_to_type}") -def model_parameter_count(model: LazyModel) -> int: - total_model_parameters = 0 - for i, (name, lazy_tensor) in enumerate(model.items()): +def per_model_weight_count_estimation(model: LazyModel, expert_count:int) -> int: + # TODO: Ensure parameter count is accurate throughout various model type + sum_weight_estimate = 0 + for name, lazy_tensor in model.items(): + # We don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): + continue + + # Got A Tensor sum_weights_in_tensor = 1 + + # Tensor Volume for dim in lazy_tensor.shape: sum_weights_in_tensor *= dim - total_model_parameters += sum_weights_in_tensor - return total_model_parameters + # Add Tensor Volume To Running Count + sum_weight_estimate += sum_weights_in_tensor -def model_parameter_count_rounded_notation(model_params_count: int) -> str: - if model_params_count > 1e12 : - # Trillions Of Parameters - scaled_model_params = model_params_count * 1e-12 - scale_suffix = "T" - elif model_params_count > 1e9 : - # Billions Of Parameters - scaled_model_params = model_params_count * 1e-9 - scale_suffix = "B" - elif model_params_count > 1e6 : - # Millions Of Parameters - scaled_model_params = model_params_count * 1e-6 - scale_suffix = "M" - else: - # Thousands Of Parameters - scaled_model_params = model_params_count * 1e-3 - scale_suffix = "K" + # Calculate weight estimate per model + per_model_weight_estimate = (sum_weight_estimate / expert_count) if (expert_count > 0) else sum_weight_estimate - return f"{round(scaled_model_params)}{scale_suffix}" + return per_model_weight_estimate def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel: @@ -1231,34 +1224,21 @@ class VocabFactory: return vocab, special_vocab -def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str: - quantization = { +def default_convention_outfile(file_type: GGMLFileType, model_name:str, expert_count:int, model_params_count: int, metadata: Metadata) -> str: + name = metadata.name if metadata is not None and metadata.name is not None else model_name + version = metadata.version if metadata is not None and metadata.version is not None else None + + encodingScheme = { GGMLFileType.AllF32: "F32", GGMLFileType.MostlyF16: "F16", GGMLFileType.MostlyQ8_0: "Q8_0", }[file_type] - parameters = 
model_parameter_count_rounded_notation(model_params_count) - - expert_count = "" - if params.n_experts is not None: - expert_count = f"{params.n_experts}x" - - version = "" - if metadata is not None and metadata.version is not None: - version = f"-{metadata.version}" - - name = "ggml-model" - if metadata is not None and metadata.name is not None: - name = metadata.name - elif params.path_model is not None: - name = params.path_model.name - - return f"{name}{version}-{expert_count}{parameters}-{quantization}" + return gguf.naming_convention(name, version, expert_count, model_params_count, encodingScheme) -def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path: - default_filename = default_convention_outfile(file_type, params, model_params_count, metadata) +def default_outfile(model_paths: list[Path], file_type: GGMLFileType, model_name:str, expert_count:int, model_params_count: int, metadata: Metadata) -> Path: + default_filename = default_convention_outfile(file_type, model_name, expert_count, model_params_count, metadata) ret = model_paths[0].parent / f"{default_filename}.gguf" if ret in model_paths: logger.error( @@ -1315,10 +1295,10 @@ def main(args_in: list[str] | None = None) -> None: if args.get_outfile: model_plus = load_some_model(args.model) params = Params.load(model_plus) - model = convert_model_names(model_plus.model, params, args.skip_unknown) - model_params_count = model_parameter_count(model_plus.model) - ftype = pick_output_type(model, args.outtype) - print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}") # noqa: NP100 + model = convert_model_names(model_plus.model, params, args.skip_unknown) + model_params_count = per_model_weight_count_estimation(model_plus.model, params.n_experts) + ftype = pick_output_type(model, args.outtype) + print(f"{default_convention_outfile(ftype, params.path_model.name, params.n_experts, model_params_count, metadata)}") # noqa: NP100 return if args.no_vocab and args.vocab_only: @@ -1334,8 +1314,8 @@ def main(args_in: list[str] | None = None) -> None: else: model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None) - model_params_count = model_parameter_count(model_plus.model) - logger.info(f"model parameters count : {model_params_count} ({model_parameter_count_rounded_notation(model_params_count)})") + model_params_count = per_model_weight_count_estimation(model_plus.model, params.n_experts) + logger.info(f"model parameters count : {model_params_count} ({gguf.model_weight_count_rounded_notation(model_params_count)})") if args.dump: do_dump_model(model_plus) @@ -1405,7 +1385,7 @@ def main(args_in: list[str] | None = None) -> None: model = convert_model_names(model, params, args.skip_unknown) ftype = pick_output_type(model, args.outtype) model = convert_to_output_type(model, ftype) - outfile = args.outfile or default_outfile(model_plus.paths, ftype, params, model_params_count, metadata) + outfile = args.outfile or default_outfile(model_plus.paths, ftype, params.path_model.name, params.n_experts, model_params_count, metadata) params.ftype = ftype logger.info(f"Writing {outfile}, format {ftype}") diff --git a/gguf-py/gguf/__init__.py b/gguf-py/gguf/__init__.py index ea5146b16..a07b8ff0d 100644 --- a/gguf-py/gguf/__init__.py +++ b/gguf-py/gguf/__init__.py @@ -5,3 +5,4 @@ from .gguf_writer import * from .quants import * from .tensor_mapping import * from .vocab import * +from .utility import * diff 
--git a/gguf-py/gguf/utility.py b/gguf-py/gguf/utility.py new file mode 100644 index 000000000..e375b687d --- /dev/null +++ b/gguf-py/gguf/utility.py @@ -0,0 +1,35 @@ +from __future__ import annotations + + +def model_weight_count_rounded_notation(model_params_count: int) -> str: + if model_params_count > 1e15 : + # Quadrillion Of Parameters + scaled_model_params = model_params_count * 1e-15 + scale_suffix = "Q" + elif model_params_count > 1e12 : + # Trillions Of Parameters + scaled_model_params = model_params_count * 1e-12 + scale_suffix = "T" + elif model_params_count > 1e9 : + # Billions Of Parameters + scaled_model_params = model_params_count * 1e-9 + scale_suffix = "B" + elif model_params_count > 1e6 : + # Millions Of Parameters + scaled_model_params = model_params_count * 1e-6 + scale_suffix = "M" + else: + # Thousands Of Parameters + scaled_model_params = model_params_count * 1e-3 + scale_suffix = "K" + return f"{round(scaled_model_params)}{scale_suffix}" + + +def naming_convention(model_name: str, version_string:str, expert_count_int:int, model_params_count: int, encodingScheme: str) -> str: + # Reference: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention + name = model_name.strip().replace(' ', '-') if model_name is not None else "ggml-model" + version = f"-{version_string}" if version_string is not None else "" + expert_count_chunk = f"{expert_count_int}x" if expert_count_int is not None and expert_count_int > 0 else "" + parameters = model_weight_count_rounded_notation(model_params_count) + encodingScheme = encodingScheme.upper() + return f"{name}{version}-{expert_count_chunk}{parameters}-{encodingScheme}"
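
Usage sketch (illustrative only, not part of the diff): the snippet below shows the JSON keys that Metadata.load() reads from the file passed via --metadata, and the default filename that --get-outfile reports, built by gguf.naming_convention(). It assumes the patched gguf-py package is importable; the model name, version, author and parameter count are made-up values.

    # Illustration of the helpers introduced above; values are hypothetical.
    import json
    import gguf

    # Keys understood by Metadata.load(); pass this file with `--metadata metadata.json`.
    with open("metadata.json", "w") as f:
        json.dump({
            "general.name": "Mistral 7B Instruct",
            "general.version": "v0.2",
            "general.author": "Mistral AI",
        }, f)

    # Parameter count rounded to a size label, then the default output filename.
    print(gguf.model_weight_count_rounded_notation(7_240_000_000))
    # -> 7B
    print(gguf.naming_convention("Mistral 7B Instruct", "v0.2", None, 7_240_000_000, "Q8_0"))
    # -> Mistral-7B-Instruct-v0.2-7B-Q8_0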