gguf-py : remove LlamaFileTypeMap
Too specific to 'llama.cpp', and would be a maintenance burden to keep up to date.

* gguf-py : add generic quantize and dequantize functions

The quant classes no longer need to be known, only the target or the source type,
for 'quantize' and 'dequantize', respectively.
parent e82ff5a346
commit 229c35cb59
4 changed files with 54 additions and 58 deletions
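A minimal usage sketch of the generic entry points described above; it is not part of the commit. It assumes gguf-py is importable as 'gguf' (the same way the converter below uses it), and the 4x64 random tensor is purely illustrative.

import numpy as np
import gguf

data = np.random.rand(4, 64).astype(np.float32)

# Only the target type is needed; no quant class has to be imported or known.
packed = gguf.quants.quantize(data, gguf.GGMLQuantizationType.Q8_0)    # packed blocks, dtype uint8

# Only the source type is needed to get back to float32 (with quantization error).
restored = gguf.quants.dequantize(packed, gguf.GGMLQuantizationType.Q8_0)

# F32 and F16 are handled as plain dtype casts.
half = gguf.quants.quantize(data, gguf.GGMLQuantizationType.F16)       # equivalent to astype(np.float16)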
@@ -301,20 +301,25 @@ class Model:
                 ):
                     data_qtype = gguf.GGMLQuantizationType.F32
 
                 # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
                 if isinstance(data_qtype, bool):
-                    data_qtype = gguf.LlamaFileTypeMap.get(self.ftype, gguf.GGMLQuantizationType.F32)
-
-                if data_qtype == gguf.GGMLQuantizationType.Q8_0:
-                    if gguf.quants.Q8_0.can_quantize(data):
-                        data = gguf.quants.Q8_0.quantize(data)
-                    else:  # fallback to f16
-                        data_qtype = gguf.GGMLQuantizationType.F16
-                if data_qtype == gguf.GGMLQuantizationType.BF16:
-                    data = gguf.quants.BF16.quantize(data)
-                if data_qtype == gguf.GGMLQuantizationType.F16:
-                    data = data.astype(np.float16, copy=False)
-                if data_qtype == gguf.GGMLQuantizationType.F32:
-                    data = data.astype(np.float32, copy=False)
+                    if self.ftype == gguf.LlamaFileType.ALL_F32:
+                        data_qtype = gguf.GGMLQuantizationType.F32
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
+                        data_qtype = gguf.GGMLQuantizationType.F16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                        data_qtype = gguf.GGMLQuantizationType.BF16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+                    else:
+                        raise ValueError(f"Unknown file type: {self.ftype.name}")
+
+                try:
+                    data = gguf.quants.quantize(data, data_qtype)
+                except gguf.QuantError as e:
+                    logger.warning("%s, %s", e, "falling back to F16")
+                    data_qtype = gguf.GGMLQuantizationType.F16
+                    data = gguf.quants.quantize(data, data_qtype)
 
                 shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
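The converter no longer special-cases Q8_0: it asks for the requested type and falls back to F16 when the library raises gguf.QuantError. A stand-alone sketch of that fallback, not from the commit; the 2x60 array is made up, and it relies on Q8_0 packing rows in blocks of 32 values, so a 60-wide row cannot be quantized.

import numpy as np
import gguf

data = np.ones((2, 60), dtype=np.float32)
data_qtype = gguf.GGMLQuantizationType.Q8_0

try:
    data = gguf.quants.quantize(data, data_qtype)
except gguf.QuantError as e:
    # Same recovery as the converter: keep the tensor, store it as F16 instead.
    print(e)
    data_qtype = gguf.GGMLQuantizationType.F16
    data = gguf.quants.quantize(data, data_qtype)

print(data_qtype.name, data.dtype)  # F16 float16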
@@ -1196,47 +1196,6 @@ class LlamaFileType(IntEnum):
     GUESSED = 1024  # not specified in the model file
 
 
-# Default quantization type for each file type
-# Keep this the same as in llama_model_quantize_internal from llama.cpp
-LlamaFileTypeMap: dict[LlamaFileType, GGMLQuantizationType] = {
-    LlamaFileType.MOSTLY_Q4_0: GGMLQuantizationType.Q4_0,
-    LlamaFileType.MOSTLY_Q4_1: GGMLQuantizationType.Q4_1,
-    LlamaFileType.MOSTLY_Q5_0: GGMLQuantizationType.Q5_0,
-    LlamaFileType.MOSTLY_Q5_1: GGMLQuantizationType.Q5_1,
-    LlamaFileType.MOSTLY_Q8_0: GGMLQuantizationType.Q8_0,
-    LlamaFileType.MOSTLY_F16: GGMLQuantizationType.F16,
-    LlamaFileType.MOSTLY_BF16: GGMLQuantizationType.BF16,
-    LlamaFileType.ALL_F32: GGMLQuantizationType.F32,
-
-    # K-quants
-    LlamaFileType.MOSTLY_Q2_K_S: GGMLQuantizationType.Q2_K,
-    LlamaFileType.MOSTLY_Q2_K: GGMLQuantizationType.Q2_K,
-    LlamaFileType.MOSTLY_IQ3_XS: GGMLQuantizationType.IQ3_S,
-    LlamaFileType.MOSTLY_Q3_K_S: GGMLQuantizationType.Q3_K,
-    LlamaFileType.MOSTLY_Q3_K_M: GGMLQuantizationType.Q3_K,
-    LlamaFileType.MOSTLY_Q3_K_L: GGMLQuantizationType.Q3_K,
-    LlamaFileType.MOSTLY_Q4_K_S: GGMLQuantizationType.Q4_K,
-    LlamaFileType.MOSTLY_Q4_K_M: GGMLQuantizationType.Q4_K,
-    LlamaFileType.MOSTLY_Q5_K_S: GGMLQuantizationType.Q5_K,
-    LlamaFileType.MOSTLY_Q5_K_M: GGMLQuantizationType.Q5_K,
-    LlamaFileType.MOSTLY_Q6_K: GGMLQuantizationType.Q6_K,
-    LlamaFileType.MOSTLY_IQ2_XXS: GGMLQuantizationType.IQ2_XXS,
-    LlamaFileType.MOSTLY_IQ2_XS: GGMLQuantizationType.IQ2_XS,
-    LlamaFileType.MOSTLY_IQ2_S: GGMLQuantizationType.IQ2_XS,
-    LlamaFileType.MOSTLY_IQ2_M: GGMLQuantizationType.IQ2_S,
-    LlamaFileType.MOSTLY_IQ3_XXS: GGMLQuantizationType.IQ3_XXS,
-    LlamaFileType.MOSTLY_IQ1_S: GGMLQuantizationType.IQ1_S,
-    LlamaFileType.MOSTLY_IQ1_M: GGMLQuantizationType.IQ1_M,
-    LlamaFileType.MOSTLY_IQ4_NL: GGMLQuantizationType.IQ4_NL,
-    LlamaFileType.MOSTLY_IQ4_XS: GGMLQuantizationType.IQ4_XS,
-    LlamaFileType.MOSTLY_IQ3_S: GGMLQuantizationType.IQ3_S,
-    LlamaFileType.MOSTLY_IQ3_M: GGMLQuantizationType.IQ3_S,
-    LlamaFileType.MOSTLY_Q4_0_4_4: GGMLQuantizationType.Q4_0_4_4,
-    LlamaFileType.MOSTLY_Q4_0_4_8: GGMLQuantizationType.Q4_0_4_8,
-    LlamaFileType.MOSTLY_Q4_0_8_8: GGMLQuantizationType.Q4_0_8_8,
-}
-
-
 class GGUFEndian(IntEnum):
     LITTLE = 0
     BIG = 1
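For code that relied on the removed map, the migration is to name a GGMLQuantizationType directly and let the quants module dispatch. A hedged before/after sketch, not from the commit; the tensor and the MOSTLY_Q8_0 choice are illustrative, and the 'before' lines show the old API only as comments.

import numpy as np
import gguf

data = np.zeros((1, 32), dtype=np.float32)

# Before this commit: map a file type to its default tensor type, then call the quant class.
#   qtype = gguf.LlamaFileTypeMap.get(gguf.LlamaFileType.MOSTLY_Q8_0, gguf.GGMLQuantizationType.F32)
#   data = gguf.quants.Q8_0.quantize(data)

# After: pick the tensor type explicitly; no per-type class and no llama.cpp-specific table.
qtype = gguf.GGMLQuantizationType.Q8_0
data = gguf.quants.quantize(data, qtype)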
@@ -191,6 +191,8 @@ class LazyBase(ABC, metaclass=LazyMeta):
 class LazyNumpyTensor(LazyBase):
     _tensor_type = np.ndarray
 
+    shape: tuple[int, ...]  # Makes the type checker happy in quants.py
+
     @classmethod
     def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
         # The initial idea was to use np.nan as the fill value,
@@ -46,6 +46,32 @@ def np_roundf(n: np.ndarray) -> np.ndarray:
     return np.sign(n) * b
 
 
+class QuantError(Exception): ...
+
+
+_type_traits: dict[GGMLQuantizationType, type[__Quant]] = {}
+
+
+def quantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
+    if qtype == GGMLQuantizationType.F32:
+        return data.astype(np.float32, copy=False)
+    elif qtype == GGMLQuantizationType.F16:
+        return data.astype(np.float16, copy=False)
+    elif (q := _type_traits.get(qtype)) is not None:
+        return q.quantize(data)
+    else:
+        raise NotImplementedError(f"Quantization for {qtype.name} is not yet implemented")
+
+
+def dequantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
+    if qtype == GGMLQuantizationType.F32 or qtype == GGMLQuantizationType.F16:
+        return data.astype(np.float32, copy=False)
+    elif (q := _type_traits.get(qtype)) is not None:
+        return q.dequantize(data)
+    else:
+        raise NotImplementedError(f"Dequantization for {qtype.name} is not yet implemented")
+
+
 class __Quant(ABC):
     qtype: GGMLQuantizationType
     block_size: int
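Note the F32/F16 fast paths above (plain dtype casts) and the dispatch through _type_traits for block formats. A small round-trip sketch, not from the commit, showing how the packed byte shape relates to the logical shape via gguf.quant_shape_from_byte_shape, the same helper the converter uses; the 2x64 array is illustrative.

import numpy as np
import gguf

x = np.arange(2 * 64, dtype=np.float32).reshape(2, 64)
qtype = gguf.GGMLQuantizationType.Q8_0

packed = gguf.quants.quantize(x, qtype)        # uint8; the last dimension is now a byte count
logical = gguf.quant_shape_from_byte_shape(packed.shape, qtype)
y = gguf.quants.dequantize(packed, qtype)      # float32 again

print(packed.shape, logical, y.shape)          # e.g. (2, 68) (2, 64) (2, 64)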
@@ -65,6 +91,8 @@ class __Quant(ABC):
             cls.__dequantize_array,
             meta_noop=(np.float32, cls.__shape_from_bytes)
         )
+        assert qtype not in _type_traits
+        _type_traits[qtype] = cls
 
     @classmethod
     @abstractmethod
@@ -115,26 +143,28 @@ class __Quant(ABC):
         return _apply_over_grouped_rows(cls.dequantize_rows, arr=array, otype=np.float32, oshape=cls.__shape_from_bytes(array.shape))
 
     @classmethod
-    def __quantize_lazy(cls, lazy_tensor: Any, /) -> Any:
+    def __quantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
         pass
 
     @classmethod
-    def __dequantize_lazy(cls, lazy_tensor: Any, /) -> Any:
+    def __dequantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
         pass
 
     @classmethod
-    def can_quantize(cls, tensor: np.ndarray) -> bool:
+    def can_quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> bool:
         return tensor.shape[-1] % cls.block_size == 0
 
     @classmethod
-    def quantize(cls, tensor: np.ndarray | LazyNumpyTensor):
+    def quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
+        if not cls.can_quantize(tensor):
+            raise QuantError(f"Can't quantize tensor with shape {tensor.shape} to {cls.qtype.name}")
         if isinstance(tensor, LazyNumpyTensor):
             return cls.__quantize_lazy(tensor)
         else:
             return cls.__quantize_array(tensor)
 
     @classmethod
-    def dequantize(cls, tensor: np.ndarray | LazyNumpyTensor):
+    def dequantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
         if isinstance(tensor, LazyNumpyTensor):
             return cls.__dequantize_lazy(tensor)
         else:
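With the QuantError check in 'quantize' and the widened signatures, callers can either catch the error or pre-check with can_quantize, whose signature now also admits LazyNumpyTensor. A short sketch of the pre-check style, not from the commit; the array widths are illustrative (Q8_0 blocks span 32 values).

import numpy as np
import gguf

for width in (256, 100):
    t = np.zeros((4, width), dtype=np.float32)
    if gguf.quants.Q8_0.can_quantize(t):
        print(t.shape, "->", gguf.quants.Q8_0.quantize(t).shape)
    else:
        print(t.shape, "cannot be quantized to Q8_0")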