convert-hf : get bit-exact same output as ./quantize
The quantization version was missing.

* convert-hf : don't round bf16 NaNs (see the sketch below)
* convert-hf : save some memory with np.int16 intermediate bf16 weights
* convert-hf : more closely match llama.cpp with which weights to keep in f32
parent 3801db12d8
commit 95930da30e

3 changed files with 37 additions and 12 deletions
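The bf16 bullets compress into one idea: rounding a NaN's mantissa can carry into the exponent bits and silently turn it into an infinity, so NaNs must be quieted rather than rounded, and keeping the converted bit patterns as 16-bit integers halves the intermediate array's footprint. A minimal sketch of such a conversion, assuming numpy; the helper name is illustrative, not the repository's actual function:

```python
import numpy as np

def f32_to_bf16_bits(arr: np.ndarray) -> np.ndarray:
    """Illustrative float32 -> bf16 bit-pattern conversion, stored as np.int16."""
    n = arr.astype(np.float32).view(np.uint32)
    # don't round NaNs: force each one to a quiet NaN with the low 16 bits
    # cleared, since rounding could carry into the exponent and yield +/-inf
    is_nan = (n & 0x7fffffff) > 0x7f800000
    n = np.where(is_nan, (n & np.uint32(0xffff0000)) | np.uint32(64 << 16), n)
    # round to nearest even: add half of the dropped bits plus the parity
    # bit of the kept part, then keep only the high 16 bits
    n = (n.astype(np.uint64) + (0x7fff + ((n >> 16) & 1))) >> 16
    # a 16-bit view is half the memory of keeping the full 32-bit words
    return n.astype(np.uint16).view(np.int16)
```

The uint64 widening is purely defensive: once NaNs are quieted, the rounding add can no longer carry past bit 31.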
@@ -10,6 +10,7 @@ from typing import Any
 GGUF_MAGIC             = 0x46554747  # "GGUF"
 GGUF_VERSION           = 3
 GGUF_DEFAULT_ALIGNMENT = 32
+GGML_QUANT_VERSION     = 2  # GGML_QNT_VERSION from ggml.h
 
 #
 # metadata keys
@@ -350,7 +350,7 @@ class GGUFWriter:
     def add_name(self, name: str) -> None:
         self.add_string(Keys.General.NAME, name)
 
-    def add_quantization_version(self, quantization_version: GGMLQuantizationType) -> None:
+    def add_quantization_version(self, quantization_version: int) -> None:
         self.add_uint32(
             Keys.General.QUANTIZATION_VERSION, quantization_version)
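The writer-side fix loosens the annotation to a plain int: the field is written as a uint32, and GGMLQuantizationType enumerates tensor data types rather than quantization format revisions, so the old annotation did not match what the method actually records. A minimal usage sketch, assuming the gguf-py package from this tree is importable and that GGML_QUANT_VERSION is re-exported at package level:

```python
import gguf

# hypothetical output path and model name, for illustration only
writer = gguf.GGUFWriter("model.gguf", "llama")
writer.add_name("example-model")
# record the ggml quantization format version so the header matches
# what ./quantize writes
writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
```

A real conversion would go on to add hyperparameters and tensors and call the writer's header and tensor output methods before closing the file.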