convert-hf : get bit-exact same output as ./quantize

The quantization version was missing.

* convert-hf : don't round bf16 NANs

* convert-hf : save some memory with np.int16 intermediate bf16 weights

* convert-hf : more closely match llama.cpp with which weights to keep in f32
Francis Couture-Harpin 2024-05-09 11:27:34 -04:00
parent 3801db12d8
commit 95930da30e
3 changed files with 37 additions and 12 deletions

convert-hf-to-gguf.py

@@ -142,14 +142,27 @@ class Model:
             raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}")
 
     def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
-        name: str = gguf.TENSOR_NAMES[key]
         if key not in gguf.MODEL_TENSORS[self.model_arch]:
             raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}")
+        name: str = gguf.TENSOR_NAMES[key]
         if "{bid}" in name:
             assert bid is not None
             name = name.format(bid=bid)
         return name + suffix
 
+    def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> bool:
+        if key not in gguf.MODEL_TENSORS[self.model_arch]:
+            return False
+        key_name: str = gguf.TENSOR_NAMES[key]
+        if "{bid}" in key_name:
+            if bid is None:
+                return False
+            key_name = key_name.format(bid=bid)
+        else:
+            if bid is not None:
+                return False
+        return name == (key_name + suffix)
+
     def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
         new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
         if new_name is None:
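
The new match_model_tensor_name helper checks whether an already-mapped tensor name corresponds to a given gguf.MODEL_TENSOR entry, including the block id for per-layer tensors. Below is a minimal, self-contained sketch of the same matching logic; the two-entry name table is hypothetical and stands in for gguf.TENSOR_NAMES.

# Hypothetical mini name table standing in for gguf.TENSOR_NAMES (keyed by gguf.MODEL_TENSOR)
TENSOR_NAMES = {
    "TOKEN_TYPES":  "token_types",              # global tensor, no block id
    "FFN_GATE_INP": "blk.{bid}.ffn_gate_inp",   # per-layer tensor, needs a block id
}

def match_model_tensor_name(name, key, bid, suffix=".weight"):
    key_name = TENSOR_NAMES[key]
    if "{bid}" in key_name:
        if bid is None:
            return False                 # per-layer tensor requires a block id
        key_name = key_name.format(bid=bid)
    elif bid is not None:
        return False                     # global tensor must not carry a block id
    return name == (key_name + suffix)

assert match_model_tensor_name("token_types.weight", "TOKEN_TYPES", None)
assert match_model_tensor_name("blk.3.ffn_gate_inp.weight", "FFN_GATE_INP", 3)
assert not match_model_tensor_name("blk.3.ffn_gate_inp.weight", "FFN_GATE_INP", None)
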
@@ -218,12 +231,12 @@ class Model:
         # same as ggml_compute_fp32_to_bf16 in ggml-impl.h
         def np_fp32_to_bf16(n: np.ndarray):
             # force nan to quiet
-            n = np.where((n & 0x7fffffff) > 0x7f800000, n | (64 << 16), n)
+            n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
             # flush subnormals to zero
             n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
             # round to nearest even
             n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
-            return n
+            return n.astype(np.int16)
 
         # Doing this row-wise is much, much faster than element-wise, hence the signature
         v_fp32_to_bf16 = np.vectorize(np_fp32_to_bf16, otypes=[np.int16], signature="(n)->(n)")
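
The two changes in np_fp32_to_bf16 keep the upper mantissa bits when quieting a NaN (so the value is still a NaN after truncation to bf16) and return an np.int16 array, roughly halving the memory held by the intermediate result. A self-contained sketch of the same bit manipulation (the function name, unsigned dtypes and test values are mine, not from the commit):

import numpy as np

def fp32_to_bf16_bits(x):
    # reinterpret the float32 values as raw 32-bit integers
    n = np.asarray(x, dtype=np.float32).view(np.uint32)
    # force NaNs to quiet NaNs while keeping the upper payload bits
    n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
    # flush subnormals to (signed) zero
    n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
    # round to nearest even, then keep only the upper 16 bits
    n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
    return n.astype(np.uint16)

x = np.array([1.0, -1.0, float("nan"), 1e-40], dtype=np.float32)
print([hex(int(v)) for v in fp32_to_bf16_bits(x)])
# typically ['0x3f80', '0xbf80', '0x7fc0', '0x0'] -- the NaN stays a quiet NaN, the subnormal is flushed
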
@@ -263,10 +276,25 @@ class Model:
             extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims)
 
-            # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
-            extra_f32 = extra_f32 or n_dims == 1 or new_name.endswith("_norm.weight")
+            # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
+
+            extra_f32 = any(cond for cond in (
+                extra_f32,
+                n_dims == 1,
+                new_name.endswith("_norm.weight"),
+            ))
+
+            # Some tensor types are always in float32
+            extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in (
+                gguf.MODEL_TENSOR.FFN_GATE_INP,
+                gguf.MODEL_TENSOR.POS_EMBD,
+                gguf.MODEL_TENSOR.TOKEN_TYPES,
+            ))
 
             # if f16 desired, convert any float32 2-dim weight tensors to float16
-            extra_f16 = extra_f16 or (name.endswith(".weight") and n_dims >= 2)
+            extra_f16 = any(cond for cond in (
+                extra_f16,
+                (name.endswith(".weight") and n_dims >= 2),
+            ))
 
             if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
                 if self.ftype == gguf.LlamaFileType.MOSTLY_F16:
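
Together with match_model_tensor_name, these conditions keep the FFN_GATE_INP (MoE router), POS_EMBD and TOKEN_TYPES tensors in F32, matching what llama_model_quantize_internal does in llama.cpp. A condensed, hypothetical restating of the resulting dtype choice (the real code also consults the per-model extra_f32_tensors/extra_f16_tensors hooks):

# Hypothetical helper condensing the decision above; the string literals are
# gguf-style tensor names used for illustration, not calls into the gguf package.
def output_dtype(name, new_name, n_dims, want_f16):
    keep_f32 = (
        n_dims == 1                                    # 1D tensors (biases, norms, scales)
        or new_name.endswith("_norm.weight")           # norm weights
        or new_name.endswith("ffn_gate_inp.weight")    # MoE router weights
        or new_name in ("position_embd.weight", "token_types.weight")
    )
    to_f16 = name.endswith(".weight") and n_dims >= 2
    return "F16" if (want_f16 and to_f16 and not keep_f32) else "F32"

print(output_dtype("model.layers.0.input_layernorm.weight", "blk.0.attn_norm.weight", 1, True))  # F32
print(output_dtype("model.layers.0.self_attn.q_proj.weight", "blk.0.attn_q.weight", 2, True))    # F16
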
@@ -2050,12 +2078,6 @@ class BertModel(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
-    def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
-        del new_name, bid, n_dims  # unused
-
-        # not used with get_rows, must be F32
-        return name == "embeddings.token_type_embeddings.weight"
-
 
 @Model.register("NomicBertModel")
 class NomicBertModel(BertModel):
@@ -2453,6 +2475,8 @@ def main() -> None:
     logger.info("Set model tokenizer")
     model_instance.set_vocab()
 
+    model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION);
+
     if args.vocab_only:
         logger.info(f"Exporting model vocab to '{fname_out}'")
         model_instance.write_vocab()

gguf-py/gguf/constants.py

@@ -10,6 +10,7 @@ from typing import Any
 GGUF_MAGIC = 0x46554747  # "GGUF"
 GGUF_VERSION = 3
 GGUF_DEFAULT_ALIGNMENT = 32
+GGML_QUANT_VERSION = 2  # GGML_QNT_VERSION from ggml.h
 
 #
 # metadata keys

gguf-py/gguf/gguf_writer.py

@@ -350,7 +350,7 @@ class GGUFWriter:
     def add_name(self, name: str) -> None:
         self.add_string(Keys.General.NAME, name)
 
-    def add_quantization_version(self, quantization_version: GGMLQuantizationType) -> None:
+    def add_quantization_version(self, quantization_version: int) -> None:
         self.add_uint32(
             Keys.General.QUANTIZATION_VERSION, quantization_version)
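
With add_quantization_version now typed as a plain int and GGML_QUANT_VERSION exported from the constants module, the call added to main() above can be reproduced in isolation. A minimal sketch of writing the general.quantization_version field with gguf-py; the output path and architecture are placeholders, not from the commit:

import gguf

# placeholder path and architecture, just to show the new key being written
writer = gguf.GGUFWriter("out.gguf", "llama")
writer.add_quantization_version(gguf.GGML_QUANT_VERSION)  # general.quantization_version = 2
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()
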