gguf-py : remove LlamaFileTypeMap

Too specific to 'llama.cpp', and would be a maintenance burden
to keep up to date.

* gguf-py : add generic quantize and dequantize functions

The quant classes no longer need to be known,
only the target or the source type,
for 'quantize' and 'dequantize', respectively.
This commit is contained in:
Francis Couture-Harpin 2024-08-03 21:22:37 -04:00
parent e82ff5a346
commit 229c35cb59
4 changed files with 54 additions and 58 deletions

View file

@ -1196,47 +1196,6 @@ class LlamaFileType(IntEnum):
GUESSED = 1024 # not specified in the model file
# Default quantization type for each file type
# Keep this the same as in llama_model_quantize_internal from llama.cpp
LlamaFileTypeMap: dict[LlamaFileType, GGMLQuantizationType] = {
LlamaFileType.MOSTLY_Q4_0: GGMLQuantizationType.Q4_0,
LlamaFileType.MOSTLY_Q4_1: GGMLQuantizationType.Q4_1,
LlamaFileType.MOSTLY_Q5_0: GGMLQuantizationType.Q5_0,
LlamaFileType.MOSTLY_Q5_1: GGMLQuantizationType.Q5_1,
LlamaFileType.MOSTLY_Q8_0: GGMLQuantizationType.Q8_0,
LlamaFileType.MOSTLY_F16: GGMLQuantizationType.F16,
LlamaFileType.MOSTLY_BF16: GGMLQuantizationType.BF16,
LlamaFileType.ALL_F32: GGMLQuantizationType.F32,
# K-quants
LlamaFileType.MOSTLY_Q2_K_S: GGMLQuantizationType.Q2_K,
LlamaFileType.MOSTLY_Q2_K: GGMLQuantizationType.Q2_K,
LlamaFileType.MOSTLY_IQ3_XS: GGMLQuantizationType.IQ3_S,
LlamaFileType.MOSTLY_Q3_K_S: GGMLQuantizationType.Q3_K,
LlamaFileType.MOSTLY_Q3_K_M: GGMLQuantizationType.Q3_K,
LlamaFileType.MOSTLY_Q3_K_L: GGMLQuantizationType.Q3_K,
LlamaFileType.MOSTLY_Q4_K_S: GGMLQuantizationType.Q4_K,
LlamaFileType.MOSTLY_Q4_K_M: GGMLQuantizationType.Q4_K,
LlamaFileType.MOSTLY_Q5_K_S: GGMLQuantizationType.Q5_K,
LlamaFileType.MOSTLY_Q5_K_M: GGMLQuantizationType.Q5_K,
LlamaFileType.MOSTLY_Q6_K: GGMLQuantizationType.Q6_K,
LlamaFileType.MOSTLY_IQ2_XXS: GGMLQuantizationType.IQ2_XXS,
LlamaFileType.MOSTLY_IQ2_XS: GGMLQuantizationType.IQ2_XS,
LlamaFileType.MOSTLY_IQ2_S: GGMLQuantizationType.IQ2_XS,
LlamaFileType.MOSTLY_IQ2_M: GGMLQuantizationType.IQ2_S,
LlamaFileType.MOSTLY_IQ3_XXS: GGMLQuantizationType.IQ3_XXS,
LlamaFileType.MOSTLY_IQ1_S: GGMLQuantizationType.IQ1_S,
LlamaFileType.MOSTLY_IQ1_M: GGMLQuantizationType.IQ1_M,
LlamaFileType.MOSTLY_IQ4_NL: GGMLQuantizationType.IQ4_NL,
LlamaFileType.MOSTLY_IQ4_XS: GGMLQuantizationType.IQ4_XS,
LlamaFileType.MOSTLY_IQ3_S: GGMLQuantizationType.IQ3_S,
LlamaFileType.MOSTLY_IQ3_M: GGMLQuantizationType.IQ3_S,
LlamaFileType.MOSTLY_Q4_0_4_4: GGMLQuantizationType.Q4_0_4_4,
LlamaFileType.MOSTLY_Q4_0_4_8: GGMLQuantizationType.Q4_0_4_8,
LlamaFileType.MOSTLY_Q4_0_8_8: GGMLQuantizationType.Q4_0_8_8,
}
class GGUFEndian(IntEnum):
LITTLE = 0
BIG = 1

View file

@ -191,6 +191,8 @@ class LazyBase(ABC, metaclass=LazyMeta):
class LazyNumpyTensor(LazyBase):
_tensor_type = np.ndarray
shape: tuple[int, ...] # Makes the type checker happy in quants.py
@classmethod
def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
# The initial idea was to use np.nan as the fill value,

View file

@ -46,6 +46,32 @@ def np_roundf(n: np.ndarray) -> np.ndarray:
return np.sign(n) * b
class QuantError(Exception): ...
_type_traits: dict[GGMLQuantizationType, type[__Quant]] = {}
def quantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
if qtype == GGMLQuantizationType.F32:
return data.astype(np.float32, copy=False)
elif qtype == GGMLQuantizationType.F16:
return data.astype(np.float16, copy=False)
elif (q := _type_traits.get(qtype)) is not None:
return q.quantize(data)
else:
raise NotImplementedError(f"Quantization for {qtype.name} is not yet implemented")
def dequantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
if qtype == GGMLQuantizationType.F32 or qtype == GGMLQuantizationType.F16:
return data.astype(np.float32, copy=False)
elif (q := _type_traits.get(qtype)) is not None:
return q.dequantize(data)
else:
raise NotImplementedError(f"Dequantization for {qtype.name} is not yet implemented")
class __Quant(ABC):
qtype: GGMLQuantizationType
block_size: int
@ -65,6 +91,8 @@ class __Quant(ABC):
cls.__dequantize_array,
meta_noop=(np.float32, cls.__shape_from_bytes)
)
assert qtype not in _type_traits
_type_traits[qtype] = cls
@classmethod
@abstractmethod
@ -115,26 +143,28 @@ class __Quant(ABC):
return _apply_over_grouped_rows(cls.dequantize_rows, arr=array, otype=np.float32, oshape=cls.__shape_from_bytes(array.shape))
@classmethod
def __quantize_lazy(cls, lazy_tensor: Any, /) -> Any:
def __quantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
pass
@classmethod
def __dequantize_lazy(cls, lazy_tensor: Any, /) -> Any:
def __dequantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
pass
@classmethod
def can_quantize(cls, tensor: np.ndarray) -> bool:
def can_quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> bool:
return tensor.shape[-1] % cls.block_size == 0
@classmethod
def quantize(cls, tensor: np.ndarray | LazyNumpyTensor):
def quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
if not cls.can_quantize(tensor):
raise QuantError(f"Can't quantize tensor with shape {tensor.shape} to {cls.qtype.name}")
if isinstance(tensor, LazyNumpyTensor):
return cls.__quantize_lazy(tensor)
else:
return cls.__quantize_array(tensor)
@classmethod
def dequantize(cls, tensor: np.ndarray | LazyNumpyTensor):
def dequantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
if isinstance(tensor, LazyNumpyTensor):
return cls.__dequantize_lazy(tensor)
else: