gguf-py : remove LlamaFileTypeMap

Too specific to 'llama.cpp', and it would be a maintenance burden to keep up to date.

* gguf-py : add generic quantize and dequantize functions

The quant classes no longer need to be known: 'quantize' only needs the target type, and 'dequantize' only needs the source type.
2024-08-03 21:22:37 -04:00 · 2024-08-03 21:22:37 -04:00 · 229c35cb59
commit 229c35cb59
parent e82ff5a346
4 changed files with 54 additions and 58 deletions
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@ -1196,47 +1196,6 @@ class LlamaFileType(IntEnum):
    GUESSED              = 1024  # not specified in the model file


-# Default quantization type for each file type
-# Keep this the same as in llama_model_quantize_internal from llama.cpp
-LlamaFileTypeMap: dict[LlamaFileType, GGMLQuantizationType] = {
-    LlamaFileType.MOSTLY_Q4_0: GGMLQuantizationType.Q4_0,
-    LlamaFileType.MOSTLY_Q4_1: GGMLQuantizationType.Q4_1,
-    LlamaFileType.MOSTLY_Q5_0: GGMLQuantizationType.Q5_0,
-    LlamaFileType.MOSTLY_Q5_1: GGMLQuantizationType.Q5_1,
-    LlamaFileType.MOSTLY_Q8_0: GGMLQuantizationType.Q8_0,
-    LlamaFileType.MOSTLY_F16:  GGMLQuantizationType.F16,
-    LlamaFileType.MOSTLY_BF16: GGMLQuantizationType.BF16,
-    LlamaFileType.ALL_F32:     GGMLQuantizationType.F32,
-
-    # K-quants
-    LlamaFileType.MOSTLY_Q2_K_S:  GGMLQuantizationType.Q2_K,
-    LlamaFileType.MOSTLY_Q2_K:    GGMLQuantizationType.Q2_K,
-    LlamaFileType.MOSTLY_IQ3_XS:  GGMLQuantizationType.IQ3_S,
-    LlamaFileType.MOSTLY_Q3_K_S:  GGMLQuantizationType.Q3_K,
-    LlamaFileType.MOSTLY_Q3_K_M:  GGMLQuantizationType.Q3_K,
-    LlamaFileType.MOSTLY_Q3_K_L:  GGMLQuantizationType.Q3_K,
-    LlamaFileType.MOSTLY_Q4_K_S:  GGMLQuantizationType.Q4_K,
-    LlamaFileType.MOSTLY_Q4_K_M:  GGMLQuantizationType.Q4_K,
-    LlamaFileType.MOSTLY_Q5_K_S:  GGMLQuantizationType.Q5_K,
-    LlamaFileType.MOSTLY_Q5_K_M:  GGMLQuantizationType.Q5_K,
-    LlamaFileType.MOSTLY_Q6_K:    GGMLQuantizationType.Q6_K,
-    LlamaFileType.MOSTLY_IQ2_XXS: GGMLQuantizationType.IQ2_XXS,
-    LlamaFileType.MOSTLY_IQ2_XS:  GGMLQuantizationType.IQ2_XS,
-    LlamaFileType.MOSTLY_IQ2_S:   GGMLQuantizationType.IQ2_XS,
-    LlamaFileType.MOSTLY_IQ2_M:   GGMLQuantizationType.IQ2_S,
-    LlamaFileType.MOSTLY_IQ3_XXS: GGMLQuantizationType.IQ3_XXS,
-    LlamaFileType.MOSTLY_IQ1_S:   GGMLQuantizationType.IQ1_S,
-    LlamaFileType.MOSTLY_IQ1_M:   GGMLQuantizationType.IQ1_M,
-    LlamaFileType.MOSTLY_IQ4_NL:  GGMLQuantizationType.IQ4_NL,
-    LlamaFileType.MOSTLY_IQ4_XS:  GGMLQuantizationType.IQ4_XS,
-    LlamaFileType.MOSTLY_IQ3_S:   GGMLQuantizationType.IQ3_S,
-    LlamaFileType.MOSTLY_IQ3_M:   GGMLQuantizationType.IQ3_S,
-    LlamaFileType.MOSTLY_Q4_0_4_4: GGMLQuantizationType.Q4_0_4_4,
-    LlamaFileType.MOSTLY_Q4_0_4_8: GGMLQuantizationType.Q4_0_4_8,
-    LlamaFileType.MOSTLY_Q4_0_8_8: GGMLQuantizationType.Q4_0_8_8,
-}
-
-
 class GGUFEndian(IntEnum):
    LITTLE = 0
    BIG = 1
--- a/gguf-py/gguf/lazy.py
+++ b/gguf-py/gguf/lazy.py
@ -191,6 +191,8 @@ class LazyBase(ABC, metaclass=LazyMeta):
 class LazyNumpyTensor(LazyBase):
    _tensor_type = np.ndarray

+    shape: tuple[int, ...]  # Makes the type checker happy in quants.py
+
    @classmethod
    def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
        # The initial idea was to use np.nan as the fill value,
--- a/gguf-py/gguf/quants.py
+++ b/gguf-py/gguf/quants.py
@ -46,6 +46,32 @@ def np_roundf(n: np.ndarray) -> np.ndarray:
    return np.sign(n) * b


+class QuantError(Exception): ...
+
+
+_type_traits: dict[GGMLQuantizationType, type[__Quant]] = {}
+
+
+def quantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
+    if qtype == GGMLQuantizationType.F32:
+        return data.astype(np.float32, copy=False)
+    elif qtype == GGMLQuantizationType.F16:
+        return data.astype(np.float16, copy=False)
+    elif (q := _type_traits.get(qtype)) is not None:
+        return q.quantize(data)
+    else:
+        raise NotImplementedError(f"Quantization for {qtype.name} is not yet implemented")
+
+
+def dequantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
+    if qtype == GGMLQuantizationType.F32 or qtype == GGMLQuantizationType.F16:
+        return data.astype(np.float32, copy=False)
+    elif (q := _type_traits.get(qtype)) is not None:
+        return q.dequantize(data)
+    else:
+        raise NotImplementedError(f"Dequantization for {qtype.name} is not yet implemented")
+
+
 class __Quant(ABC):
    qtype: GGMLQuantizationType
    block_size: int
@ -65,6 +91,8 @@ class __Quant(ABC):
            cls.__dequantize_array,
            meta_noop=(np.float32, cls.__shape_from_bytes)
        )
+        assert qtype not in _type_traits
+        _type_traits[qtype] = cls

    @classmethod
    @abstractmethod
@ -115,26 +143,28 @@ class __Quant(ABC):
        return _apply_over_grouped_rows(cls.dequantize_rows, arr=array, otype=np.float32, oshape=cls.__shape_from_bytes(array.shape))

    @classmethod
-    def __quantize_lazy(cls, lazy_tensor: Any, /) -> Any:
+    def __quantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
        pass

    @classmethod
-    def __dequantize_lazy(cls, lazy_tensor: Any, /) -> Any:
+    def __dequantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
        pass

    @classmethod
-    def can_quantize(cls, tensor: np.ndarray) -> bool:
+    def can_quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> bool:
        return tensor.shape[-1] % cls.block_size == 0

    @classmethod
-    def quantize(cls, tensor: np.ndarray | LazyNumpyTensor):
+    def quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
+        if not cls.can_quantize(tensor):
+            raise QuantError(f"Can't quantize tensor with shape {tensor.shape} to {cls.qtype.name}")
        if isinstance(tensor, LazyNumpyTensor):
            return cls.__quantize_lazy(tensor)
        else:
            return cls.__quantize_array(tensor)

    @classmethod
-    def dequantize(cls, tensor: np.ndarray | LazyNumpyTensor):
+    def dequantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
        if isinstance(tensor, LazyNumpyTensor):
            return cls.__dequantize_lazy(tensor)
        else: