gguf-py : remove LlamaFileTypeMap

Too specific to 'llama.cpp', and would be a maintenance burden
to keep up to date.

* gguf-py : add generic quantize and dequantize functions

The quant classes no longer need to be known by callers;
only the target type (for 'quantize') or the source type
(for 'dequantize') is required.
Author: Francis Couture-Harpin
Date:   2024-08-03 21:22:37 -04:00
Commit: 229c35cb59 (parent e82ff5a346)

4 changed files with 54 additions and 58 deletions
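
The new module-level helpers are enough for the common case. A minimal usage sketch (the array contents, shapes, and the choice of Q8_0 are illustrative only):

    import numpy as np
    import gguf

    data = np.random.rand(4, 256).astype(np.float32)  # last dim is a multiple of Q8_0's block size (32)

    # Quantize knowing only the target type; no quant class is referenced directly.
    packed = gguf.quants.quantize(data, gguf.GGMLQuantizationType.Q8_0)

    # Dequantize knowing only the source type; the result is float32.
    restored = gguf.quants.dequantize(packed, gguf.GGMLQuantizationType.Q8_0)
    assert restored.shape == data.shape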

convert_hf_to_gguf.py

@@ -301,20 +301,25 @@ class Model:
                 ):
                     data_qtype = gguf.GGMLQuantizationType.F32
 
+                # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
                 if isinstance(data_qtype, bool):
-                    data_qtype = gguf.LlamaFileTypeMap.get(self.ftype, gguf.GGMLQuantizationType.F32)
-
-                if data_qtype == gguf.GGMLQuantizationType.Q8_0:
-                    if gguf.quants.Q8_0.can_quantize(data):
-                        data = gguf.quants.Q8_0.quantize(data)
-                    else:  # fallback to f16
-                        data_qtype = gguf.GGMLQuantizationType.F16
-                if data_qtype == gguf.GGMLQuantizationType.BF16:
-                    data = gguf.quants.BF16.quantize(data)
-                if data_qtype == gguf.GGMLQuantizationType.F16:
-                    data = data.astype(np.float16, copy=False)
-                if data_qtype == gguf.GGMLQuantizationType.F32:
-                    data = data.astype(np.float32, copy=False)
+                    if self.ftype == gguf.LlamaFileType.ALL_F32:
+                        data_qtype = gguf.GGMLQuantizationType.F32
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
+                        data_qtype = gguf.GGMLQuantizationType.F16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                        data_qtype = gguf.GGMLQuantizationType.BF16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+                    else:
+                        raise ValueError(f"Unknown file type: {self.ftype.name}")
+
+                try:
+                    data = gguf.quants.quantize(data, data_qtype)
+                except gguf.QuantError as e:
+                    logger.warning("%s, %s", e, "falling back to F16")
+                    data_qtype = gguf.GGMLQuantizationType.F16
+                    data = gguf.quants.quantize(data, data_qtype)
 
                 shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
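
Condensed, the converter's new control flow is: resolve data_qtype from the file type, ask gguf.quants.quantize for it, and fall back to F16 only when QuantError is raised. A standalone sketch of when that fallback actually triggers (the tensor shape and logger name are assumptions for illustration):

    import logging
    import numpy as np
    import gguf

    logger = logging.getLogger("example")  # stand-in for the converter's own logger

    data = np.random.rand(4, 100).astype(np.float32)  # 100 is not a multiple of Q8_0's 32-wide blocks
    data_qtype = gguf.GGMLQuantizationType.Q8_0

    try:
        data = gguf.quants.quantize(data, data_qtype)
    except gguf.QuantError as e:
        logger.warning("%s, %s", e, "falling back to F16")
        data_qtype = gguf.GGMLQuantizationType.F16
        data = gguf.quants.quantize(data, data_qtype)  # F16 has no block-size constraint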

gguf-py/gguf/constants.py

@@ -1196,47 +1196,6 @@ class LlamaFileType(IntEnum):
     GUESSED              = 1024  # not specified in the model file
 
 
-# Default quantization type for each file type
-# Keep this the same as in llama_model_quantize_internal from llama.cpp
-LlamaFileTypeMap: dict[LlamaFileType, GGMLQuantizationType] = {
-    LlamaFileType.MOSTLY_Q4_0:    GGMLQuantizationType.Q4_0,
-    LlamaFileType.MOSTLY_Q4_1:    GGMLQuantizationType.Q4_1,
-    LlamaFileType.MOSTLY_Q5_0:    GGMLQuantizationType.Q5_0,
-    LlamaFileType.MOSTLY_Q5_1:    GGMLQuantizationType.Q5_1,
-    LlamaFileType.MOSTLY_Q8_0:    GGMLQuantizationType.Q8_0,
-    LlamaFileType.MOSTLY_F16:     GGMLQuantizationType.F16,
-    LlamaFileType.MOSTLY_BF16:    GGMLQuantizationType.BF16,
-    LlamaFileType.ALL_F32:        GGMLQuantizationType.F32,
-
-    # K-quants
-    LlamaFileType.MOSTLY_Q2_K_S:  GGMLQuantizationType.Q2_K,
-    LlamaFileType.MOSTLY_Q2_K:    GGMLQuantizationType.Q2_K,
-    LlamaFileType.MOSTLY_IQ3_XS:  GGMLQuantizationType.IQ3_S,
-    LlamaFileType.MOSTLY_Q3_K_S:  GGMLQuantizationType.Q3_K,
-    LlamaFileType.MOSTLY_Q3_K_M:  GGMLQuantizationType.Q3_K,
-    LlamaFileType.MOSTLY_Q3_K_L:  GGMLQuantizationType.Q3_K,
-    LlamaFileType.MOSTLY_Q4_K_S:  GGMLQuantizationType.Q4_K,
-    LlamaFileType.MOSTLY_Q4_K_M:  GGMLQuantizationType.Q4_K,
-    LlamaFileType.MOSTLY_Q5_K_S:  GGMLQuantizationType.Q5_K,
-    LlamaFileType.MOSTLY_Q5_K_M:  GGMLQuantizationType.Q5_K,
-    LlamaFileType.MOSTLY_Q6_K:    GGMLQuantizationType.Q6_K,
-    LlamaFileType.MOSTLY_IQ2_XXS: GGMLQuantizationType.IQ2_XXS,
-    LlamaFileType.MOSTLY_IQ2_XS:  GGMLQuantizationType.IQ2_XS,
-    LlamaFileType.MOSTLY_IQ2_S:   GGMLQuantizationType.IQ2_XS,
-    LlamaFileType.MOSTLY_IQ2_M:   GGMLQuantizationType.IQ2_S,
-    LlamaFileType.MOSTLY_IQ3_XXS: GGMLQuantizationType.IQ3_XXS,
-    LlamaFileType.MOSTLY_IQ1_S:   GGMLQuantizationType.IQ1_S,
-    LlamaFileType.MOSTLY_IQ1_M:   GGMLQuantizationType.IQ1_M,
-    LlamaFileType.MOSTLY_IQ4_NL:  GGMLQuantizationType.IQ4_NL,
-    LlamaFileType.MOSTLY_IQ4_XS:  GGMLQuantizationType.IQ4_XS,
-    LlamaFileType.MOSTLY_IQ3_S:   GGMLQuantizationType.IQ3_S,
-    LlamaFileType.MOSTLY_IQ3_M:   GGMLQuantizationType.IQ3_S,
-    LlamaFileType.MOSTLY_Q4_0_4_4: GGMLQuantizationType.Q4_0_4_4,
-    LlamaFileType.MOSTLY_Q4_0_4_8: GGMLQuantizationType.Q4_0_4_8,
-    LlamaFileType.MOSTLY_Q4_0_8_8: GGMLQuantizationType.Q4_0_8_8,
-}
-
-
 class GGUFEndian(IntEnum):
     LITTLE = 0
     BIG = 1
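
Any downstream code that imported LlamaFileTypeMap now has to carry its own mapping; for the file types the converter can actually emit, that is only four entries. A possible local replacement (the dict name is hypothetical, not part of gguf-py):

    from gguf.constants import GGMLQuantizationType, LlamaFileType

    FTYPE_TO_QTYPE: dict[LlamaFileType, GGMLQuantizationType] = {
        LlamaFileType.ALL_F32:     GGMLQuantizationType.F32,
        LlamaFileType.MOSTLY_F16:  GGMLQuantizationType.F16,
        LlamaFileType.MOSTLY_BF16: GGMLQuantizationType.BF16,
        LlamaFileType.MOSTLY_Q8_0: GGMLQuantizationType.Q8_0,
    }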

gguf-py/gguf/lazy.py

@@ -191,6 +191,8 @@ class LazyBase(ABC, metaclass=LazyMeta):
 class LazyNumpyTensor(LazyBase):
     _tensor_type = np.ndarray
 
+    shape: tuple[int, ...]  # Makes the type checker happy in quants.py
+
     @classmethod
     def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
         # The initial idea was to use np.nan as the fill value,

gguf-py/gguf/quants.py

@@ -46,6 +46,32 @@ def np_roundf(n: np.ndarray) -> np.ndarray:
     return np.sign(n) * b
 
 
+class QuantError(Exception): ...
+
+
+_type_traits: dict[GGMLQuantizationType, type[__Quant]] = {}
+
+
+def quantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
+    if qtype == GGMLQuantizationType.F32:
+        return data.astype(np.float32, copy=False)
+    elif qtype == GGMLQuantizationType.F16:
+        return data.astype(np.float16, copy=False)
+    elif (q := _type_traits.get(qtype)) is not None:
+        return q.quantize(data)
+    else:
+        raise NotImplementedError(f"Quantization for {qtype.name} is not yet implemented")
+
+
+def dequantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
+    if qtype == GGMLQuantizationType.F32 or qtype == GGMLQuantizationType.F16:
+        return data.astype(np.float32, copy=False)
+    elif (q := _type_traits.get(qtype)) is not None:
+        return q.dequantize(data)
+    else:
+        raise NotImplementedError(f"Dequantization for {qtype.name} is not yet implemented")
+
+
 class __Quant(ABC):
     qtype: GGMLQuantizationType
     block_size: int
@@ -65,6 +91,8 @@ class __Quant(ABC):
             cls.__dequantize_array,
             meta_noop=(np.float32, cls.__shape_from_bytes)
         )
+        assert qtype not in _type_traits
+        _type_traits[qtype] = cls
 
     @classmethod
     @abstractmethod
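
quantize() and dequantize() can stay generic because every concrete quant class registers itself in _type_traits as it is defined; from the surrounding context this registration appears to sit in __Quant.__init_subclass__. A simplified, self-contained sketch of the same pattern (the names here are made up, not the real classes):

    from __future__ import annotations
    from abc import ABC

    _registry: dict[str, type[Base]] = {}

    class Base(ABC):
        name: str

        def __init_subclass__(cls, name: str, **kwargs) -> None:
            super().__init_subclass__(**kwargs)
            cls.name = name
            assert name not in _registry   # each type registers exactly once
            _registry[name] = cls

    class Q8_0(Base, name="Q8_0"):
        pass

    # Generic dispatch, analogous to _type_traits.get(qtype) in quants.py.
    assert _registry.get("Q8_0") is Q8_0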
@@ -115,26 +143,28 @@ class __Quant(ABC):
         return _apply_over_grouped_rows(cls.dequantize_rows, arr=array, otype=np.float32, oshape=cls.__shape_from_bytes(array.shape))
 
     @classmethod
-    def __quantize_lazy(cls, lazy_tensor: Any, /) -> Any:
+    def __quantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
         pass
 
     @classmethod
-    def __dequantize_lazy(cls, lazy_tensor: Any, /) -> Any:
+    def __dequantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
         pass
 
     @classmethod
-    def can_quantize(cls, tensor: np.ndarray) -> bool:
+    def can_quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> bool:
         return tensor.shape[-1] % cls.block_size == 0
 
     @classmethod
-    def quantize(cls, tensor: np.ndarray | LazyNumpyTensor):
+    def quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
+        if not cls.can_quantize(tensor):
+            raise QuantError(f"Can't quantize tensor with shape {tensor.shape} to {cls.qtype.name}")
         if isinstance(tensor, LazyNumpyTensor):
             return cls.__quantize_lazy(tensor)
         else:
             return cls.__quantize_array(tensor)
 
     @classmethod
-    def dequantize(cls, tensor: np.ndarray | LazyNumpyTensor):
+    def dequantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
         if isinstance(tensor, LazyNumpyTensor):
             return cls.__dequantize_lazy(tensor)
         else:
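
Note that the shape check has moved into quantize() itself: the converter used to call can_quantize() by hand, and now it simply catches QuantError. Callers that prefer a pre-check can still use it; a small sketch (the array shape below is an arbitrary example):

    import numpy as np
    import gguf

    t = np.ones((4, 50), dtype=np.float32)       # 50 is not a multiple of Q8_0's 32-wide blocks
    assert not gguf.quants.Q8_0.can_quantize(t)  # pre-check is still available
    # gguf.quants.Q8_0.quantize(t) would raise gguf.QuantError here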