gguf-py : remove LlamaFileTypeMap

Too specific to 'llama.cpp', and would be a maintenance burden
to keep up to date.

* gguf-py : add generic quantize and dequantize functions

The quant classes no longer need to be known by callers;
only the target type (for 'quantize') or the source type
(for 'dequantize') is required.
Author: Francis Couture-Harpin
Date:   2024-08-03 21:22:37 -04:00
Commit: 229c35cb59 (parent e82ff5a346)

4 changed files with 54 additions and 58 deletions
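
The new module-level helpers are enough for the common case. A minimal usage sketch (the array contents, shapes, and the choice of Q8_0 are illustrative only):

    import numpy as np
    import gguf

    data = np.random.rand(4, 256).astype(np.float32)  # last dim is a multiple of Q8_0's block size (32)

    # Quantize knowing only the target type; no quant class is referenced directly.
    packed = gguf.quants.quantize(data, gguf.GGMLQuantizationType.Q8_0)

    # Dequantize knowing only the source type; the result is float32.
    restored = gguf.quants.dequantize(packed, gguf.GGMLQuantizationType.Q8_0)
    assert restored.shape == data.shape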

convert_hf_to_gguf.py

@@ -301,20 +301,25 @@ class Model:
                 ):
                     data_qtype = gguf.GGMLQuantizationType.F32
 
+                # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
                 if isinstance(data_qtype, bool):
-                    data_qtype = gguf.LlamaFileTypeMap.get(self.ftype, gguf.GGMLQuantizationType.F32)
-
-                if data_qtype == gguf.GGMLQuantizationType.Q8_0:
-                    if gguf.quants.Q8_0.can_quantize(data):
-                        data = gguf.quants.Q8_0.quantize(data)
-                    else:  # fallback to f16
-                        data_qtype = gguf.GGMLQuantizationType.F16
-                if data_qtype == gguf.GGMLQuantizationType.BF16:
-                    data = gguf.quants.BF16.quantize(data)
-                if data_qtype == gguf.GGMLQuantizationType.F16:
-                    data = data.astype(np.float16, copy=False)
-                if data_qtype == gguf.GGMLQuantizationType.F32:
-                    data = data.astype(np.float32, copy=False)
+                    if self.ftype == gguf.LlamaFileType.ALL_F32:
+                        data_qtype = gguf.GGMLQuantizationType.F32
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
+                        data_qtype = gguf.GGMLQuantizationType.F16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                        data_qtype = gguf.GGMLQuantizationType.BF16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+                    else:
+                        raise ValueError(f"Unknown file type: {self.ftype.name}")
+
+                try:
+                    data = gguf.quants.quantize(data, data_qtype)
+                except gguf.QuantError as e:
+                    logger.warning("%s, %s", e, "falling back to F16")
+                    data_qtype = gguf.GGMLQuantizationType.F16
+                    data = gguf.quants.quantize(data, data_qtype)
 
                 shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
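
Condensed, the converter's new control flow is: resolve data_qtype from the file type, ask gguf.quants.quantize for it, and fall back to F16 only when QuantError is raised. A standalone sketch of when that fallback actually triggers (the tensor shape and logger name are assumptions for illustration):

    import logging
    import numpy as np
    import gguf

    logger = logging.getLogger("example")  # stand-in for the converter's own logger

    data = np.random.rand(4, 100).astype(np.float32)  # 100 is not a multiple of Q8_0's 32-wide blocks
    data_qtype = gguf.GGMLQuantizationType.Q8_0

    try:
        data = gguf.quants.quantize(data, data_qtype)
    except gguf.QuantError as e:
        logger.warning("%s, %s", e, "falling back to F16")
        data_qtype = gguf.GGMLQuantizationType.F16
        data = gguf.quants.quantize(data, data_qtype)  # F16 has no block-size constraint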

gguf-py/gguf/constants.py

@@ -1196,47 +1196,6 @@ class LlamaFileType(IntEnum):
     GUESSED              = 1024  # not specified in the model file
 
 
-# Default quantization type for each file type
-# Keep this the same as in llama_model_quantize_internal from llama.cpp
-LlamaFileTypeMap: dict[LlamaFileType, GGMLQuantizationType] = {
-    LlamaFileType.MOSTLY_Q4_0:    GGMLQuantizationType.Q4_0,
-    LlamaFileType.MOSTLY_Q4_1:    GGMLQuantizationType.Q4_1,
-    LlamaFileType.MOSTLY_Q5_0:    GGMLQuantizationType.Q5_0,
-    LlamaFileType.MOSTLY_Q5_1:    GGMLQuantizationType.Q5_1,
-    LlamaFileType.MOSTLY_Q8_0:    GGMLQuantizationType.Q8_0,
-    LlamaFileType.MOSTLY_F16:     GGMLQuantizationType.F16,
-    LlamaFileType.MOSTLY_BF16:    GGMLQuantizationType.BF16,
-    LlamaFileType.ALL_F32:        GGMLQuantizationType.F32,
-
-    # K-quants
-    LlamaFileType.MOSTLY_Q2_K_S:  GGMLQuantizationType.Q2_K,
-    LlamaFileType.MOSTLY_Q2_K:    GGMLQuantizationType.Q2_K,
-    LlamaFileType.MOSTLY_IQ3_XS:  GGMLQuantizationType.IQ3_S,
-    LlamaFileType.MOSTLY_Q3_K_S:  GGMLQuantizationType.Q3_K,
-    LlamaFileType.MOSTLY_Q3_K_M:  GGMLQuantizationType.Q3_K,
-    LlamaFileType.MOSTLY_Q3_K_L:  GGMLQuantizationType.Q3_K,
-    LlamaFileType.MOSTLY_Q4_K_S:  GGMLQuantizationType.Q4_K,
-    LlamaFileType.MOSTLY_Q4_K_M:  GGMLQuantizationType.Q4_K,
-    LlamaFileType.MOSTLY_Q5_K_S:  GGMLQuantizationType.Q5_K,
-    LlamaFileType.MOSTLY_Q5_K_M:  GGMLQuantizationType.Q5_K,
-    LlamaFileType.MOSTLY_Q6_K:    GGMLQuantizationType.Q6_K,
-    LlamaFileType.MOSTLY_IQ2_XXS: GGMLQuantizationType.IQ2_XXS,
-    LlamaFileType.MOSTLY_IQ2_XS:  GGMLQuantizationType.IQ2_XS,
-    LlamaFileType.MOSTLY_IQ2_S:   GGMLQuantizationType.IQ2_XS,
-    LlamaFileType.MOSTLY_IQ2_M:   GGMLQuantizationType.IQ2_S,
-    LlamaFileType.MOSTLY_IQ3_XXS: GGMLQuantizationType.IQ3_XXS,
-    LlamaFileType.MOSTLY_IQ1_S:   GGMLQuantizationType.IQ1_S,
-    LlamaFileType.MOSTLY_IQ1_M:   GGMLQuantizationType.IQ1_M,
-    LlamaFileType.MOSTLY_IQ4_NL:  GGMLQuantizationType.IQ4_NL,
-    LlamaFileType.MOSTLY_IQ4_XS:  GGMLQuantizationType.IQ4_XS,
-    LlamaFileType.MOSTLY_IQ3_S:   GGMLQuantizationType.IQ3_S,
-    LlamaFileType.MOSTLY_IQ3_M:   GGMLQuantizationType.IQ3_S,
-    LlamaFileType.MOSTLY_Q4_0_4_4: GGMLQuantizationType.Q4_0_4_4,
-    LlamaFileType.MOSTLY_Q4_0_4_8: GGMLQuantizationType.Q4_0_4_8,
-    LlamaFileType.MOSTLY_Q4_0_8_8: GGMLQuantizationType.Q4_0_8_8,
-}
-
-
 class GGUFEndian(IntEnum):
     LITTLE = 0
     BIG = 1
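
Any downstream code that imported LlamaFileTypeMap now has to carry its own mapping; for the file types the converter can actually emit, that is only four entries. A possible local replacement (the dict name is hypothetical, not part of gguf-py):

    from gguf.constants import GGMLQuantizationType, LlamaFileType

    FTYPE_TO_QTYPE: dict[LlamaFileType, GGMLQuantizationType] = {
        LlamaFileType.ALL_F32:     GGMLQuantizationType.F32,
        LlamaFileType.MOSTLY_F16:  GGMLQuantizationType.F16,
        LlamaFileType.MOSTLY_BF16: GGMLQuantizationType.BF16,
        LlamaFileType.MOSTLY_Q8_0: GGMLQuantizationType.Q8_0,
    }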

gguf-py/gguf/lazy.py

@@ -191,6 +191,8 @@ class LazyBase(ABC, metaclass=LazyMeta):
 class LazyNumpyTensor(LazyBase):
     _tensor_type = np.ndarray
 
+    shape: tuple[int, ...]  # Makes the type checker happy in quants.py
+
     @classmethod
     def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
         # The initial idea was to use np.nan as the fill value,

gguf-py/gguf/quants.py

@@ -46,6 +46,32 @@ def np_roundf(n: np.ndarray) -> np.ndarray:
     return np.sign(n) * b
 
 
+class QuantError(Exception): ...
+
+
+_type_traits: dict[GGMLQuantizationType, type[__Quant]] = {}
+
+
+def quantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
+    if qtype == GGMLQuantizationType.F32:
+        return data.astype(np.float32, copy=False)
+    elif qtype == GGMLQuantizationType.F16:
+        return data.astype(np.float16, copy=False)
+    elif (q := _type_traits.get(qtype)) is not None:
+        return q.quantize(data)
+    else:
+        raise NotImplementedError(f"Quantization for {qtype.name} is not yet implemented")
+
+
+def dequantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
+    if qtype == GGMLQuantizationType.F32 or qtype == GGMLQuantizationType.F16:
+        return data.astype(np.float32, copy=False)
+    elif (q := _type_traits.get(qtype)) is not None:
+        return q.dequantize(data)
+    else:
+        raise NotImplementedError(f"Dequantization for {qtype.name} is not yet implemented")
+
+
 class __Quant(ABC):
     qtype: GGMLQuantizationType
     block_size: int
@@ -65,6 +91,8 @@ class __Quant(ABC):
             cls.__dequantize_array,
             meta_noop=(np.float32, cls.__shape_from_bytes)
         )
+        assert qtype not in _type_traits
+        _type_traits[qtype] = cls
 
     @classmethod
     @abstractmethod
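
quantize() and dequantize() can stay generic because every concrete quant class registers itself in _type_traits as it is defined; from the surrounding context this registration appears to sit in __Quant.__init_subclass__. A simplified, self-contained sketch of the same pattern (the names here are made up, not the real classes):

    from __future__ import annotations
    from abc import ABC

    _registry: dict[str, type[Base]] = {}

    class Base(ABC):
        name: str

        def __init_subclass__(cls, name: str, **kwargs) -> None:
            super().__init_subclass__(**kwargs)
            cls.name = name
            assert name not in _registry   # each type registers exactly once
            _registry[name] = cls

    class Q8_0(Base, name="Q8_0"):
        pass

    # Generic dispatch, analogous to _type_traits.get(qtype) in quants.py.
    assert _registry.get("Q8_0") is Q8_0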
@@ -115,26 +143,28 @@ class __Quant(ABC):
         return _apply_over_grouped_rows(cls.dequantize_rows, arr=array, otype=np.float32, oshape=cls.__shape_from_bytes(array.shape))
 
     @classmethod
-    def __quantize_lazy(cls, lazy_tensor: Any, /) -> Any:
+    def __quantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
         pass
 
     @classmethod
-    def __dequantize_lazy(cls, lazy_tensor: Any, /) -> Any:
+    def __dequantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
         pass
 
     @classmethod
-    def can_quantize(cls, tensor: np.ndarray) -> bool:
+    def can_quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> bool:
         return tensor.shape[-1] % cls.block_size == 0
 
     @classmethod
-    def quantize(cls, tensor: np.ndarray | LazyNumpyTensor):
+    def quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
+        if not cls.can_quantize(tensor):
+            raise QuantError(f"Can't quantize tensor with shape {tensor.shape} to {cls.qtype.name}")
         if isinstance(tensor, LazyNumpyTensor):
             return cls.__quantize_lazy(tensor)
         else:
             return cls.__quantize_array(tensor)
 
     @classmethod
-    def dequantize(cls, tensor: np.ndarray | LazyNumpyTensor):
+    def dequantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
         if isinstance(tensor, LazyNumpyTensor):
             return cls.__dequantize_lazy(tensor)
         else:
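
Note that the shape check has moved into quantize() itself: the converter used to call can_quantize() by hand, and now it simply catches QuantError. Callers that prefer a pre-check can still use it; a small sketch (the array shape below is an arbitrary example):

    import numpy as np
    import gguf

    t = np.ones((4, 50), dtype=np.float32)       # 50 is not a multiple of Q8_0's 32-wide blocks
    assert not gguf.quants.Q8_0.can_quantize(t)  # pre-check is still available
    # gguf.quants.Q8_0.quantize(t) would raise gguf.QuantError here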