From 229c35cb590b436947ac221a7ce5cb8da2b0eb2d Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Sat, 3 Aug 2024 21:22:37 -0400
Subject: [PATCH] gguf-py : remove LlamaFileTypeMap

Too specific to 'llama.cpp', and would be a maintenance burden to keep
up to date.

* gguf-py : add generic quantize and dequantize functions

The quant classes no longer need to be known, only the target or the
source type, for 'quantize' and 'dequantize', respectively.
---
 convert_hf_to_gguf.py     | 29 +++++++++++++++------------
 gguf-py/gguf/constants.py | 41 ---------------------------------------
 gguf-py/gguf/lazy.py      |  2 ++
 gguf-py/gguf/quants.py    | 40 +++++++++++++++++++++++++++++++++-----
 4 files changed, 54 insertions(+), 58 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index bfdf29a64..dac5baa69 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -301,20 +301,25 @@ class Model:
                 ):
                     data_qtype = gguf.GGMLQuantizationType.F32
 
+                # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
                 if isinstance(data_qtype, bool):
-                    data_qtype = gguf.LlamaFileTypeMap.get(self.ftype, gguf.GGMLQuantizationType.F32)
-
-                if data_qtype == gguf.GGMLQuantizationType.Q8_0:
-                    if gguf.quants.Q8_0.can_quantize(data):
-                        data = gguf.quants.Q8_0.quantize(data)
-                    else:  # fallback to f16
+                    if self.ftype == gguf.LlamaFileType.ALL_F32:
+                        data_qtype = gguf.GGMLQuantizationType.F32
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
                         data_qtype = gguf.GGMLQuantizationType.F16
-                if data_qtype == gguf.GGMLQuantizationType.BF16:
-                    data = gguf.quants.BF16.quantize(data)
-                if data_qtype == gguf.GGMLQuantizationType.F16:
-                    data = data.astype(np.float16, copy=False)
-                if data_qtype == gguf.GGMLQuantizationType.F32:
-                    data = data.astype(np.float32, copy=False)
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                        data_qtype = gguf.GGMLQuantizationType.BF16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+                    else:
+                        raise ValueError(f"Unknown file type: {self.ftype.name}")
+
+                try:
+                    data = gguf.quants.quantize(data, data_qtype)
+                except gguf.QuantError as e:
+                    logger.warning("%s, %s", e, "falling back to F16")
+                    data_qtype = gguf.GGMLQuantizationType.F16
+                    data = gguf.quants.quantize(data, data_qtype)
 
                 shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index af7408be9..cad99a6b1 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -1196,47 +1196,6 @@ class LlamaFileType(IntEnum):
     GUESSED = 1024  # not specified in the model file
 
 
-# Default quantization type for each file type
-# Keep this the same as in llama_model_quantize_internal from llama.cpp
-LlamaFileTypeMap: dict[LlamaFileType, GGMLQuantizationType] = {
-    LlamaFileType.MOSTLY_Q4_0: GGMLQuantizationType.Q4_0,
-    LlamaFileType.MOSTLY_Q4_1: GGMLQuantizationType.Q4_1,
-    LlamaFileType.MOSTLY_Q5_0: GGMLQuantizationType.Q5_0,
-    LlamaFileType.MOSTLY_Q5_1: GGMLQuantizationType.Q5_1,
-    LlamaFileType.MOSTLY_Q8_0: GGMLQuantizationType.Q8_0,
-    LlamaFileType.MOSTLY_F16: GGMLQuantizationType.F16,
-    LlamaFileType.MOSTLY_BF16: GGMLQuantizationType.BF16,
-    LlamaFileType.ALL_F32: GGMLQuantizationType.F32,
-
-    # K-quants
-    LlamaFileType.MOSTLY_Q2_K_S: GGMLQuantizationType.Q2_K,
-    LlamaFileType.MOSTLY_Q2_K: GGMLQuantizationType.Q2_K,
-    LlamaFileType.MOSTLY_IQ3_XS: GGMLQuantizationType.IQ3_S,
-    LlamaFileType.MOSTLY_Q3_K_S: GGMLQuantizationType.Q3_K,
-    LlamaFileType.MOSTLY_Q3_K_M: GGMLQuantizationType.Q3_K,
-    LlamaFileType.MOSTLY_Q3_K_L: GGMLQuantizationType.Q3_K,
-    LlamaFileType.MOSTLY_Q4_K_S: GGMLQuantizationType.Q4_K,
-    LlamaFileType.MOSTLY_Q4_K_M: GGMLQuantizationType.Q4_K,
-    LlamaFileType.MOSTLY_Q5_K_S: GGMLQuantizationType.Q5_K,
-    LlamaFileType.MOSTLY_Q5_K_M: GGMLQuantizationType.Q5_K,
-    LlamaFileType.MOSTLY_Q6_K: GGMLQuantizationType.Q6_K,
-    LlamaFileType.MOSTLY_IQ2_XXS: GGMLQuantizationType.IQ2_XXS,
-    LlamaFileType.MOSTLY_IQ2_XS: GGMLQuantizationType.IQ2_XS,
-    LlamaFileType.MOSTLY_IQ2_S: GGMLQuantizationType.IQ2_XS,
-    LlamaFileType.MOSTLY_IQ2_M: GGMLQuantizationType.IQ2_S,
-    LlamaFileType.MOSTLY_IQ3_XXS: GGMLQuantizationType.IQ3_XXS,
-    LlamaFileType.MOSTLY_IQ1_S: GGMLQuantizationType.IQ1_S,
-    LlamaFileType.MOSTLY_IQ1_M: GGMLQuantizationType.IQ1_M,
-    LlamaFileType.MOSTLY_IQ4_NL: GGMLQuantizationType.IQ4_NL,
-    LlamaFileType.MOSTLY_IQ4_XS: GGMLQuantizationType.IQ4_XS,
-    LlamaFileType.MOSTLY_IQ3_S: GGMLQuantizationType.IQ3_S,
-    LlamaFileType.MOSTLY_IQ3_M: GGMLQuantizationType.IQ3_S,
-    LlamaFileType.MOSTLY_Q4_0_4_4: GGMLQuantizationType.Q4_0_4_4,
-    LlamaFileType.MOSTLY_Q4_0_4_8: GGMLQuantizationType.Q4_0_4_8,
-    LlamaFileType.MOSTLY_Q4_0_8_8: GGMLQuantizationType.Q4_0_8_8,
-}
-
-
 class GGUFEndian(IntEnum):
     LITTLE = 0
     BIG = 1
diff --git a/gguf-py/gguf/lazy.py b/gguf-py/gguf/lazy.py
index ac98d9a92..8d4fece2d 100644
--- a/gguf-py/gguf/lazy.py
+++ b/gguf-py/gguf/lazy.py
@@ -191,6 +191,8 @@ class LazyBase(ABC, metaclass=LazyMeta):
 class LazyNumpyTensor(LazyBase):
     _tensor_type = np.ndarray
 
+    shape: tuple[int, ...]  # Makes the type checker happy in quants.py
+
     @classmethod
     def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
         # The initial idea was to use np.nan as the fill value,
diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py
index a5a78a7bc..a443dd27e 100644
--- a/gguf-py/gguf/quants.py
+++ b/gguf-py/gguf/quants.py
@@ -46,6 +46,32 @@ def np_roundf(n: np.ndarray) -> np.ndarray:
     return np.sign(n) * b
 
 
+class QuantError(Exception): ...
+
+
+_type_traits: dict[GGMLQuantizationType, type[__Quant]] = {}
+
+
+def quantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
+    if qtype == GGMLQuantizationType.F32:
+        return data.astype(np.float32, copy=False)
+    elif qtype == GGMLQuantizationType.F16:
+        return data.astype(np.float16, copy=False)
+    elif (q := _type_traits.get(qtype)) is not None:
+        return q.quantize(data)
+    else:
+        raise NotImplementedError(f"Quantization for {qtype.name} is not yet implemented")
+
+
+def dequantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
+    if qtype == GGMLQuantizationType.F32 or qtype == GGMLQuantizationType.F16:
+        return data.astype(np.float32, copy=False)
+    elif (q := _type_traits.get(qtype)) is not None:
+        return q.dequantize(data)
+    else:
+        raise NotImplementedError(f"Dequantization for {qtype.name} is not yet implemented")
+
+
 class __Quant(ABC):
     qtype: GGMLQuantizationType
     block_size: int
@@ -65,6 +91,8 @@ class __Quant(ABC):
             cls.__dequantize_array,
             meta_noop=(np.float32, cls.__shape_from_bytes)
         )
+        assert qtype not in _type_traits
+        _type_traits[qtype] = cls
 
     @classmethod
     @abstractmethod
@@ -115,26 +143,28 @@ class __Quant(ABC):
         return _apply_over_grouped_rows(cls.dequantize_rows, arr=array, otype=np.float32, oshape=cls.__shape_from_bytes(array.shape))
 
     @classmethod
-    def __quantize_lazy(cls, lazy_tensor: Any, /) -> Any:
+    def __quantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
         pass
 
     @classmethod
-    def __dequantize_lazy(cls, lazy_tensor: Any, /) -> Any:
+    def __dequantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
         pass
 
     @classmethod
-    def can_quantize(cls, tensor: np.ndarray) -> bool:
+    def can_quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> bool:
         return tensor.shape[-1] % cls.block_size == 0
 
     @classmethod
-    def quantize(cls, tensor: np.ndarray | LazyNumpyTensor):
+    def quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
+        if not cls.can_quantize(tensor):
+            raise QuantError(f"Can't quantize tensor with shape {tensor.shape} to {cls.qtype.name}")
         if isinstance(tensor, LazyNumpyTensor):
            return cls.__quantize_lazy(tensor)
        else:
            return cls.__quantize_array(tensor)
 
     @classmethod
-    def dequantize(cls, tensor: np.ndarray | LazyNumpyTensor):
+    def dequantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
         if isinstance(tensor, LazyNumpyTensor):
             return cls.__dequantize_lazy(tensor)
         else:
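
Usage sketch (not part of the patch): a minimal example of the generic API added to gguf-py/gguf/quants.py above. The array shape and the F16 fallback here are illustrative; they mirror the convert_hf_to_gguf.py hunk but are not taken from it verbatim.

    import numpy as np
    import gguf

    # Last dimension is a multiple of Q8_0's block size (32), so the shape check passes.
    data = np.random.rand(4, 64).astype(np.float32)

    qtype = gguf.GGMLQuantizationType.Q8_0
    try:
        packed = gguf.quants.quantize(data, qtype)    # raw uint8 blocks; no Q8_0 class needed
    except gguf.QuantError:                           # e.g. rows not divisible by the block size
        qtype = gguf.GGMLQuantizationType.F16         # same fallback as in convert_hf_to_gguf.py
        packed = gguf.quants.quantize(data, qtype)

    restored = gguf.quants.dequantize(packed, qtype)  # always back to float32
    assert restored.shape == data.shape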