convert_hf : simplify internal quantization type selection

This commit is contained in:
Francis Couture-Harpin 2024-08-02 16:14:49 -04:00
parent 1ac1a79161
commit 5e27e7e11c
2 changed files with 94 additions and 59 deletions

View file

@ -1145,6 +1145,9 @@ class GGMLQuantizationType(IntEnum):
F64 = 28
IQ1_M = 29
BF16 = 30
Q4_0_4_4 = 31
Q4_0_4_8 = 32
Q4_0_8_8 = 33
# TODO: add GGMLFileType from ggml_ftype in ggml.h
@ -1157,7 +1160,7 @@ class LlamaFileType(IntEnum):
MOSTLY_F16 = 1 # except 1d tensors
MOSTLY_Q4_0 = 2 # except 1d tensors
MOSTLY_Q4_1 = 3 # except 1d tensors
MOSTLY_Q4_1_SOME_F16 = 4 # tok_embeddings.weight and output.weight are F16
# MOSTLY_Q4_1_SOME_F16 = 4 # tok_embeddings.weight and output.weight are F16
# MOSTLY_Q4_2 = 5 # support has been removed
# MOSTLY_Q4_3 = 6 # support has been removed
MOSTLY_Q8_0 = 7 # except 1d tensors
@ -1186,10 +1189,53 @@ class LlamaFileType(IntEnum):
MOSTLY_IQ4_XS = 30 # except 1d tensors
MOSTLY_IQ1_M = 31 # except 1d tensors
MOSTLY_BF16 = 32 # except 1d tensors
MOSTLY_Q4_0_4_4 = 33 # except 1d tensors
MOSTLY_Q4_0_4_8 = 34 # except 1d tensors
MOSTLY_Q4_0_8_8 = 35 # except 1d tensors
GUESSED = 1024 # not specified in the model file
# Default quantization type for each file type
# Keep this the same as in llama_model_quantize_internal from llama.cpp
#
# Notes for maintainers:
# - The mapping is many-to-one: several file types share one tensor type
#   (e.g. MOSTLY_Q3_K_S, MOSTLY_Q3_K_M and MOSTLY_Q3_K_L all map to Q3_K).
# - The file type name does not always match the tensor type name
#   (MOSTLY_IQ2_S -> IQ2_XS, MOSTLY_IQ2_M -> IQ2_S,
#   MOSTLY_IQ3_XS -> IQ3_S, MOSTLY_IQ3_M -> IQ3_S); these are deliberate
#   per the note above about mirroring llama.cpp, not typos.
# - Not every LlamaFileType member has an entry (LlamaFileType.GUESSED is
#   absent), so a plain subscript lookup can raise KeyError.
LlamaFileTypeMap: dict[LlamaFileType, GGMLQuantizationType] = {
# Legacy Q4/Q5/Q8 block types: file type and tensor type names match one-to-one
LlamaFileType.MOSTLY_Q4_0: GGMLQuantizationType.Q4_0,
LlamaFileType.MOSTLY_Q4_1: GGMLQuantizationType.Q4_1,
LlamaFileType.MOSTLY_Q5_0: GGMLQuantizationType.Q5_0,
LlamaFileType.MOSTLY_Q5_1: GGMLQuantizationType.Q5_1,
LlamaFileType.MOSTLY_Q8_0: GGMLQuantizationType.Q8_0,
# Float types
LlamaFileType.MOSTLY_F16: GGMLQuantizationType.F16,
LlamaFileType.MOSTLY_BF16: GGMLQuantizationType.BF16,
LlamaFileType.ALL_F32: GGMLQuantizationType.F32,
# K-quants and IQ types (size variants _S/_M/_L collapse to one tensor type)
LlamaFileType.MOSTLY_Q2_K_S: GGMLQuantizationType.Q2_K,
LlamaFileType.MOSTLY_Q2_K: GGMLQuantizationType.Q2_K,
LlamaFileType.MOSTLY_IQ3_XS: GGMLQuantizationType.IQ3_S,
LlamaFileType.MOSTLY_Q3_K_S: GGMLQuantizationType.Q3_K,
LlamaFileType.MOSTLY_Q3_K_M: GGMLQuantizationType.Q3_K,
LlamaFileType.MOSTLY_Q3_K_L: GGMLQuantizationType.Q3_K,
LlamaFileType.MOSTLY_Q4_K_S: GGMLQuantizationType.Q4_K,
LlamaFileType.MOSTLY_Q4_K_M: GGMLQuantizationType.Q4_K,
LlamaFileType.MOSTLY_Q5_K_S: GGMLQuantizationType.Q5_K,
LlamaFileType.MOSTLY_Q5_K_M: GGMLQuantizationType.Q5_K,
LlamaFileType.MOSTLY_Q6_K: GGMLQuantizationType.Q6_K,
LlamaFileType.MOSTLY_IQ2_XXS: GGMLQuantizationType.IQ2_XXS,
LlamaFileType.MOSTLY_IQ2_XS: GGMLQuantizationType.IQ2_XS,
LlamaFileType.MOSTLY_IQ2_S: GGMLQuantizationType.IQ2_XS,
LlamaFileType.MOSTLY_IQ2_M: GGMLQuantizationType.IQ2_S,
LlamaFileType.MOSTLY_IQ3_XXS: GGMLQuantizationType.IQ3_XXS,
LlamaFileType.MOSTLY_IQ1_S: GGMLQuantizationType.IQ1_S,
LlamaFileType.MOSTLY_IQ1_M: GGMLQuantizationType.IQ1_M,
LlamaFileType.MOSTLY_IQ4_NL: GGMLQuantizationType.IQ4_NL,
LlamaFileType.MOSTLY_IQ4_XS: GGMLQuantizationType.IQ4_XS,
LlamaFileType.MOSTLY_IQ3_S: GGMLQuantizationType.IQ3_S,
LlamaFileType.MOSTLY_IQ3_M: GGMLQuantizationType.IQ3_S,
# Q4_0_x_y file types map to the identically named tensor types
LlamaFileType.MOSTLY_Q4_0_4_4: GGMLQuantizationType.Q4_0_4_4,
LlamaFileType.MOSTLY_Q4_0_4_8: GGMLQuantizationType.Q4_0_4_8,
LlamaFileType.MOSTLY_Q4_0_8_8: GGMLQuantizationType.Q4_0_8_8,
}
class GGUFEndian(IntEnum):
LITTLE = 0
BIG = 1
@ -1259,6 +1305,9 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
GGMLQuantizationType.F64: (1, 8),
GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
GGMLQuantizationType.BF16: (1, 2),
GGMLQuantizationType.Q4_0_4_4:(32, 2 + 16),
GGMLQuantizationType.Q4_0_4_8:(32, 2 + 16),
GGMLQuantizationType.Q4_0_8_8:(32, 2 + 16),
}