From 229c35cb590b436947ac221a7ce5cb8da2b0eb2d Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Sat, 3 Aug 2024 21:22:37 -0400
Subject: [PATCH] gguf-py : remove LlamaFileTypeMap

Too specific to 'llama.cpp', and would be a maintenance burden to keep
up to date.

* gguf-py : add generic quantize and dequantize functions

The quant classes no longer need to be known, only the target or the
source type, for 'quantize' and 'dequantize', respectively.
---
 convert_hf_to_gguf.py     | 29 +++++++++++++++------------
 gguf-py/gguf/constants.py | 41 ---------------------------------------
 gguf-py/gguf/lazy.py      |  2 ++
 gguf-py/gguf/quants.py    | 40 +++++++++++++++++++++++++++++++++-----
 4 files changed, 54 insertions(+), 58 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index bfdf29a64..dac5baa69 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -301,20 +301,25 @@ class Model:
                 ):
                     data_qtype = gguf.GGMLQuantizationType.F32
 
+                # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
                 if isinstance(data_qtype, bool):
-                    data_qtype = gguf.LlamaFileTypeMap.get(self.ftype, gguf.GGMLQuantizationType.F32)
-
-                if data_qtype == gguf.GGMLQuantizationType.Q8_0:
-                    if gguf.quants.Q8_0.can_quantize(data):
-                        data = gguf.quants.Q8_0.quantize(data)
-                    else:  # fallback to f16
+                    if self.ftype == gguf.LlamaFileType.ALL_F32:
+                        data_qtype = gguf.GGMLQuantizationType.F32
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
                         data_qtype = gguf.GGMLQuantizationType.F16
-                if data_qtype == gguf.GGMLQuantizationType.BF16:
-                    data = gguf.quants.BF16.quantize(data)
-                if data_qtype == gguf.GGMLQuantizationType.F16:
-                    data = data.astype(np.float16, copy=False)
-                if data_qtype == gguf.GGMLQuantizationType.F32:
-                    data = data.astype(np.float32, copy=False)
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                        data_qtype = gguf.GGMLQuantizationType.BF16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+                    else:
+                        raise ValueError(f"Unknown file type: {self.ftype.name}")
+
+                try:
+                    data = gguf.quants.quantize(data, data_qtype)
+                except gguf.QuantError as e:
+                    logger.warning("%s, %s", e, "falling back to F16")
+                    data_qtype = gguf.GGMLQuantizationType.F16
+                    data = gguf.quants.quantize(data, data_qtype)
 
                 shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index af7408be9..cad99a6b1 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -1196,47 +1196,6 @@ class LlamaFileType(IntEnum):
     GUESSED = 1024  # not specified in the model file
 
 
-# Default quantization type for each file type
-# Keep this the same as in llama_model_quantize_internal from llama.cpp
-LlamaFileTypeMap: dict[LlamaFileType, GGMLQuantizationType] = {
-    LlamaFileType.MOSTLY_Q4_0: GGMLQuantizationType.Q4_0,
-    LlamaFileType.MOSTLY_Q4_1: GGMLQuantizationType.Q4_1,
-    LlamaFileType.MOSTLY_Q5_0: GGMLQuantizationType.Q5_0,
-    LlamaFileType.MOSTLY_Q5_1: GGMLQuantizationType.Q5_1,
-    LlamaFileType.MOSTLY_Q8_0: GGMLQuantizationType.Q8_0,
-    LlamaFileType.MOSTLY_F16: GGMLQuantizationType.F16,
-    LlamaFileType.MOSTLY_BF16: GGMLQuantizationType.BF16,
-    LlamaFileType.ALL_F32: GGMLQuantizationType.F32,
-
-    # K-quants
-    LlamaFileType.MOSTLY_Q2_K_S: GGMLQuantizationType.Q2_K,
-    LlamaFileType.MOSTLY_Q2_K: GGMLQuantizationType.Q2_K,
-    LlamaFileType.MOSTLY_IQ3_XS: GGMLQuantizationType.IQ3_S,
-    LlamaFileType.MOSTLY_Q3_K_S: GGMLQuantizationType.Q3_K,
-    LlamaFileType.MOSTLY_Q3_K_M: GGMLQuantizationType.Q3_K,
-    LlamaFileType.MOSTLY_Q3_K_L: GGMLQuantizationType.Q3_K,
-    LlamaFileType.MOSTLY_Q4_K_S: GGMLQuantizationType.Q4_K,
-    LlamaFileType.MOSTLY_Q4_K_M: GGMLQuantizationType.Q4_K,
-    LlamaFileType.MOSTLY_Q5_K_S: GGMLQuantizationType.Q5_K,
-    LlamaFileType.MOSTLY_Q5_K_M: GGMLQuantizationType.Q5_K,
-    LlamaFileType.MOSTLY_Q6_K: GGMLQuantizationType.Q6_K,
-    LlamaFileType.MOSTLY_IQ2_XXS: GGMLQuantizationType.IQ2_XXS,
-    LlamaFileType.MOSTLY_IQ2_XS: GGMLQuantizationType.IQ2_XS,
-    LlamaFileType.MOSTLY_IQ2_S: GGMLQuantizationType.IQ2_XS,
-    LlamaFileType.MOSTLY_IQ2_M: GGMLQuantizationType.IQ2_S,
-    LlamaFileType.MOSTLY_IQ3_XXS: GGMLQuantizationType.IQ3_XXS,
-    LlamaFileType.MOSTLY_IQ1_S: GGMLQuantizationType.IQ1_S,
-    LlamaFileType.MOSTLY_IQ1_M: GGMLQuantizationType.IQ1_M,
-    LlamaFileType.MOSTLY_IQ4_NL: GGMLQuantizationType.IQ4_NL,
-    LlamaFileType.MOSTLY_IQ4_XS: GGMLQuantizationType.IQ4_XS,
-    LlamaFileType.MOSTLY_IQ3_S: GGMLQuantizationType.IQ3_S,
-    LlamaFileType.MOSTLY_IQ3_M: GGMLQuantizationType.IQ3_S,
-    LlamaFileType.MOSTLY_Q4_0_4_4: GGMLQuantizationType.Q4_0_4_4,
-    LlamaFileType.MOSTLY_Q4_0_4_8: GGMLQuantizationType.Q4_0_4_8,
-    LlamaFileType.MOSTLY_Q4_0_8_8: GGMLQuantizationType.Q4_0_8_8,
-}
-
-
 class GGUFEndian(IntEnum):
     LITTLE = 0
     BIG = 1
diff --git a/gguf-py/gguf/lazy.py b/gguf-py/gguf/lazy.py
index ac98d9a92..8d4fece2d 100644
--- a/gguf-py/gguf/lazy.py
+++ b/gguf-py/gguf/lazy.py
@@ -191,6 +191,8 @@ class LazyBase(ABC, metaclass=LazyMeta):
 class LazyNumpyTensor(LazyBase):
     _tensor_type = np.ndarray
 
+    shape: tuple[int, ...]  # Makes the type checker happy in quants.py
+
     @classmethod
     def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
         # The initial idea was to use np.nan as the fill value,
diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py
index a5a78a7bc..a443dd27e 100644
--- a/gguf-py/gguf/quants.py
+++ b/gguf-py/gguf/quants.py
@@ -46,6 +46,32 @@ def np_roundf(n: np.ndarray) -> np.ndarray:
     return np.sign(n) * b
 
 
+class QuantError(Exception): ...
+
+
+_type_traits: dict[GGMLQuantizationType, type[__Quant]] = {}
+
+
+def quantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
+    if qtype == GGMLQuantizationType.F32:
+        return data.astype(np.float32, copy=False)
+    elif qtype == GGMLQuantizationType.F16:
+        return data.astype(np.float16, copy=False)
+    elif (q := _type_traits.get(qtype)) is not None:
+        return q.quantize(data)
+    else:
+        raise NotImplementedError(f"Quantization for {qtype.name} is not yet implemented")
+
+
+def dequantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
+    if qtype == GGMLQuantizationType.F32 or qtype == GGMLQuantizationType.F16:
+        return data.astype(np.float32, copy=False)
+    elif (q := _type_traits.get(qtype)) is not None:
+        return q.dequantize(data)
+    else:
+        raise NotImplementedError(f"Dequantization for {qtype.name} is not yet implemented")
+
+
 class __Quant(ABC):
     qtype: GGMLQuantizationType
     block_size: int
@@ -65,6 +91,8 @@ class __Quant(ABC):
             cls.__dequantize_array,
             meta_noop=(np.float32, cls.__shape_from_bytes)
         )
+        assert qtype not in _type_traits
+        _type_traits[qtype] = cls
 
     @classmethod
     @abstractmethod
@@ -115,26 +143,28 @@ class __Quant(ABC):
         return _apply_over_grouped_rows(cls.dequantize_rows, arr=array, otype=np.float32, oshape=cls.__shape_from_bytes(array.shape))
 
     @classmethod
-    def __quantize_lazy(cls, lazy_tensor: Any, /) -> Any:
+    def __quantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
         pass
 
     @classmethod
-    def __dequantize_lazy(cls, lazy_tensor: Any, /) -> Any:
+    def __dequantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
         pass
 
     @classmethod
-    def can_quantize(cls, tensor: np.ndarray) -> bool:
+    def can_quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> bool:
         return tensor.shape[-1] % cls.block_size == 0
 
     @classmethod
-    def quantize(cls, tensor: np.ndarray | LazyNumpyTensor):
+    def quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
+        if not cls.can_quantize(tensor):
+            raise QuantError(f"Can't quantize tensor with shape {tensor.shape} to {cls.qtype.name}")
         if isinstance(tensor, LazyNumpyTensor):
            return cls.__quantize_lazy(tensor)
        else:
            return cls.__quantize_array(tensor)
 
     @classmethod
-    def dequantize(cls, tensor: np.ndarray | LazyNumpyTensor):
+    def dequantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
         if isinstance(tensor, LazyNumpyTensor):
             return cls.__dequantize_lazy(tensor)
         else:
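
Usage sketch (not part of the patch): a minimal example of the generic API added to gguf-py/gguf/quants.py above. The array shape and the F16 fallback here are illustrative; they mirror the convert_hf_to_gguf.py hunk but are not taken from it verbatim.

    import numpy as np
    import gguf

    # Last dimension is a multiple of Q8_0's block size (32), so the shape check passes.
    data = np.random.rand(4, 64).astype(np.float32)

    qtype = gguf.GGMLQuantizationType.Q8_0
    try:
        packed = gguf.quants.quantize(data, qtype)    # raw uint8 blocks; no Q8_0 class needed
    except gguf.QuantError:                           # e.g. rows not divisible by the block size
        qtype = gguf.GGMLQuantizationType.F16         # same fallback as in convert_hf_to_gguf.py
        packed = gguf.quants.quantize(data, qtype)

    restored = gguf.quants.dequantize(packed, qtype)  # always back to float32
    assert restored.shape == data.shape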