From 95930da30ec55d35565617203af249cecc83ff90 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Thu, 9 May 2024 11:27:34 -0400
Subject: [PATCH] convert-hf : get bit-exact same output as ./quantize

The quantization version was missing.

* convert-hf : don't round bf16 NANs

* convert-hf : save some memory with np.int16 intermediate bf16 weights

* convert-hf : more closely match llama.cpp with which weights to keep in f32
---
 convert-hf-to-gguf.py       | 46 ++++++++++++++++++++++++++++---------
 gguf-py/gguf/constants.py   |  1 +
 gguf-py/gguf/gguf_writer.py |  2 +-
 3 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index a1b37048f..868d5026c 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -142,14 +142,27 @@ class Model:
             raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}")
 
     def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
-        name: str = gguf.TENSOR_NAMES[key]
         if key not in gguf.MODEL_TENSORS[self.model_arch]:
             raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}")
+        name: str = gguf.TENSOR_NAMES[key]
         if "{bid}" in name:
             assert bid is not None
             name = name.format(bid=bid)
         return name + suffix
 
+    def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> bool:
+        if key not in gguf.MODEL_TENSORS[self.model_arch]:
+            return False
+        key_name: str = gguf.TENSOR_NAMES[key]
+        if "{bid}" in key_name:
+            if bid is None:
+                return False
+            key_name = key_name.format(bid=bid)
+        else:
+            if bid is not None:
+                return False
+        return name == (key_name + suffix)
+
     def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
         new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
         if new_name is None:
@@ -218,12 +231,12 @@ class Model:
         # same as ggml_compute_fp32_to_bf16 in ggml-impl.h
         def np_fp32_to_bf16(n: np.ndarray):
             # force nan to quiet
-            n = np.where((n & 0x7fffffff) > 0x7f800000, n | (64 << 16), n)
+            n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
             # flush subnormals to zero
             n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
             # round to nearest even
             n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
-            return n
+            return n.astype(np.int16)
 
         # Doing this row-wise is much, much faster than element-wise, hence the signature
         v_fp32_to_bf16 = np.vectorize(np_fp32_to_bf16, otypes=[np.int16], signature="(n)->(n)")
@@ -263,10 +276,25 @@ class Model:
                 extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims)
 
                 # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
-                extra_f32 = extra_f32 or n_dims == 1 or new_name.endswith("_norm.weight")
+                # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
+                extra_f32 = any(cond for cond in (
+                    extra_f32,
+                    n_dims == 1,
+                    new_name.endswith("_norm.weight"),
+                ))
+
+                # Some tensor types are always in float32
+                extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in (
+                    gguf.MODEL_TENSOR.FFN_GATE_INP,
+                    gguf.MODEL_TENSOR.POS_EMBD,
+                    gguf.MODEL_TENSOR.TOKEN_TYPES,
+                ))
 
                 # if f16 desired, convert any float32 2-dim weight tensors to float16
-                extra_f16 = extra_f16 or (name.endswith(".weight") and n_dims >= 2)
+                extra_f16 = any(cond for cond in (
+                    extra_f16,
+                    (name.endswith(".weight") and n_dims >= 2),
+                ))
 
                 if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
                     if self.ftype == gguf.LlamaFileType.MOSTLY_F16:
@@ -2050,12 +2078,6 @@ class BertModel(Model):
 
         return [(self.map_tensor_name(name), data_torch)]
 
-    def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
-        del new_name, bid, n_dims  # unused
-
-        # not used with get_rows, must be F32
-        return name == "embeddings.token_type_embeddings.weight"
-
 
 @Model.register("NomicBertModel")
 class NomicBertModel(BertModel):
@@ -2453,6 +2475,8 @@ def main() -> None:
        logger.info("Set model tokenizer")
        model_instance.set_vocab()
 
+        model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION);
+
        if args.vocab_only:
            logger.info(f"Exporting model vocab to '{fname_out}'")
            model_instance.write_vocab()
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index d35835205..c29409c9b 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -10,6 +10,7 @@ from typing import Any
 GGUF_MAGIC             = 0x46554747  # "GGUF"
 GGUF_VERSION           = 3
 GGUF_DEFAULT_ALIGNMENT = 32
+GGML_QUANT_VERSION     = 2  # GGML_QNT_VERSION from ggml.h
 
 #
 # metadata keys
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 5fca09e73..96574358d 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -350,7 +350,7 @@ class GGUFWriter:
     def add_name(self, name: str) -> None:
         self.add_string(Keys.General.NAME, name)
 
-    def add_quantization_version(self, quantization_version: GGMLQuantizationType) -> None:
+    def add_quantization_version(self, quantization_version: int) -> None:
         self.add_uint32(
             Keys.General.QUANTIZATION_VERSION, quantization_version)
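
The rounding behaviour this patch fixes can be checked in isolation. The sketch below repeats the bit manipulation of np_fp32_to_bf16 from the hunk above: NaNs are quieted with their low mantissa bits cleared so round-to-nearest-even cannot carry them into an infinity or zero, subnormals flush to signed zero, and the result is kept as an np.int16 intermediate. The helper name, the sample weights array and the .view(np.uint32) call are illustrative assumptions for this demo, not how convert-hf-to-gguf.py wires the conversion up internally (there it goes through the vectorized v_fp32_to_bf16 inside class Model).

    import numpy as np

    # Same bit manipulation as np_fp32_to_bf16 in the patch above.
    # n holds float32 values reinterpreted as unsigned 32-bit integers.
    def fp32_bits_to_bf16_bits(n: np.ndarray) -> np.ndarray:
        # force NaNs quiet: set bit 22 and clear the low 16 mantissa bits,
        # so the rounding step below cannot carry a NaN into inf or zero
        n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
        # flush subnormals to (signed) zero
        n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
        # round to nearest even, then keep the upper 16 bits
        n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
        # int16 halves the memory of the intermediate vs. keeping uint32
        return n.astype(np.int16)

    weights = np.array([1.0, 3.1415927, 1e-40, float("nan")], dtype=np.float32)
    bf16 = fp32_bits_to_bf16_bits(weights.view(np.uint32))
    print(bf16.view(np.uint16))  # bit patterns 0x3f80, 0x4049, 0x0000, 0x7fc0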