convert_hf : simplify internal quantization type selection
parent 1ac1a79161
commit 5e27e7e11c

2 changed files with 94 additions and 59 deletions
@@ -1145,6 +1145,9 @@ class GGMLQuantizationType(IntEnum):
     F64 = 28
     IQ1_M = 29
     BF16 = 30
+    Q4_0_4_4 = 31
+    Q4_0_4_8 = 32
+    Q4_0_8_8 = 33


 # TODO: add GGMLFileType from ggml_ftype in ggml.h
@@ -1157,7 +1160,7 @@ class LlamaFileType(IntEnum):
     MOSTLY_F16 = 1  # except 1d tensors
     MOSTLY_Q4_0 = 2  # except 1d tensors
     MOSTLY_Q4_1 = 3  # except 1d tensors
-    MOSTLY_Q4_1_SOME_F16 = 4  # tok_embeddings.weight and output.weight are F16
+    # MOSTLY_Q4_1_SOME_F16 = 4  # tok_embeddings.weight and output.weight are F16
     # MOSTLY_Q4_2 = 5  # support has been removed
     # MOSTLY_Q4_3 = 6  # support has been removed
     MOSTLY_Q8_0 = 7  # except 1d tensors
@@ -1186,10 +1189,53 @@ class LlamaFileType(IntEnum):
     MOSTLY_IQ4_XS = 30  # except 1d tensors
     MOSTLY_IQ1_M = 31  # except 1d tensors
     MOSTLY_BF16 = 32  # except 1d tensors
+    MOSTLY_Q4_0_4_4 = 33  # except 1d tensors
+    MOSTLY_Q4_0_4_8 = 34  # except 1d tensors
+    MOSTLY_Q4_0_8_8 = 35  # except 1d tensors

     GUESSED = 1024  # not specified in the model file


+# Default quantization type for each file type
+# Keep this the same as in llama_model_quantize_internal from llama.cpp
+LlamaFileTypeMap: dict[LlamaFileType, GGMLQuantizationType] = {
+    LlamaFileType.MOSTLY_Q4_0: GGMLQuantizationType.Q4_0,
+    LlamaFileType.MOSTLY_Q4_1: GGMLQuantizationType.Q4_1,
+    LlamaFileType.MOSTLY_Q5_0: GGMLQuantizationType.Q5_0,
+    LlamaFileType.MOSTLY_Q5_1: GGMLQuantizationType.Q5_1,
+    LlamaFileType.MOSTLY_Q8_0: GGMLQuantizationType.Q8_0,
+    LlamaFileType.MOSTLY_F16: GGMLQuantizationType.F16,
+    LlamaFileType.MOSTLY_BF16: GGMLQuantizationType.BF16,
+    LlamaFileType.ALL_F32: GGMLQuantizationType.F32,
+
+    # K-quants
+    LlamaFileType.MOSTLY_Q2_K_S: GGMLQuantizationType.Q2_K,
+    LlamaFileType.MOSTLY_Q2_K: GGMLQuantizationType.Q2_K,
+    LlamaFileType.MOSTLY_IQ3_XS: GGMLQuantizationType.IQ3_S,
+    LlamaFileType.MOSTLY_Q3_K_S: GGMLQuantizationType.Q3_K,
+    LlamaFileType.MOSTLY_Q3_K_M: GGMLQuantizationType.Q3_K,
+    LlamaFileType.MOSTLY_Q3_K_L: GGMLQuantizationType.Q3_K,
+    LlamaFileType.MOSTLY_Q4_K_S: GGMLQuantizationType.Q4_K,
+    LlamaFileType.MOSTLY_Q4_K_M: GGMLQuantizationType.Q4_K,
+    LlamaFileType.MOSTLY_Q5_K_S: GGMLQuantizationType.Q5_K,
+    LlamaFileType.MOSTLY_Q5_K_M: GGMLQuantizationType.Q5_K,
+    LlamaFileType.MOSTLY_Q6_K: GGMLQuantizationType.Q6_K,
+    LlamaFileType.MOSTLY_IQ2_XXS: GGMLQuantizationType.IQ2_XXS,
+    LlamaFileType.MOSTLY_IQ2_XS: GGMLQuantizationType.IQ2_XS,
+    LlamaFileType.MOSTLY_IQ2_S: GGMLQuantizationType.IQ2_XS,
+    LlamaFileType.MOSTLY_IQ2_M: GGMLQuantizationType.IQ2_S,
+    LlamaFileType.MOSTLY_IQ3_XXS: GGMLQuantizationType.IQ3_XXS,
+    LlamaFileType.MOSTLY_IQ1_S: GGMLQuantizationType.IQ1_S,
+    LlamaFileType.MOSTLY_IQ1_M: GGMLQuantizationType.IQ1_M,
+    LlamaFileType.MOSTLY_IQ4_NL: GGMLQuantizationType.IQ4_NL,
+    LlamaFileType.MOSTLY_IQ4_XS: GGMLQuantizationType.IQ4_XS,
+    LlamaFileType.MOSTLY_IQ3_S: GGMLQuantizationType.IQ3_S,
+    LlamaFileType.MOSTLY_IQ3_M: GGMLQuantizationType.IQ3_S,
+    LlamaFileType.MOSTLY_Q4_0_4_4: GGMLQuantizationType.Q4_0_4_4,
+    LlamaFileType.MOSTLY_Q4_0_4_8: GGMLQuantizationType.Q4_0_4_8,
+    LlamaFileType.MOSTLY_Q4_0_8_8: GGMLQuantizationType.Q4_0_8_8,
+}
+
+
 class GGUFEndian(IntEnum):
     LITTLE = 0
     BIG = 1
@@ -1259,6 +1305,9 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
     GGMLQuantizationType.F64: (1, 8),
     GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
     GGMLQuantizationType.BF16: (1, 2),
+    GGMLQuantizationType.Q4_0_4_4: (32, 2 + 16),
+    GGMLQuantizationType.Q4_0_4_8: (32, 2 + 16),
+    GGMLQuantizationType.Q4_0_8_8: (32, 2 + 16),
 }
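The LlamaFileTypeMap added above lets the converter turn a requested file type into its default tensor quantization type with a single dictionary lookup instead of a long if/elif chain. A minimal sketch of such a lookup, assuming the gguf-py package layout (the F16 fallback for unmapped file types is this sketch's assumption, not something the commit specifies):

    from gguf.constants import GGMLQuantizationType, LlamaFileType, LlamaFileTypeMap

    def default_quant_type(ftype: LlamaFileType) -> GGMLQuantizationType:
        # Resolve a file type to its default per-tensor quantization type.
        # Falling back to F16 for unmapped file types is an assumption of
        # this sketch; the real converter may reject such file types instead.
        return LlamaFileTypeMap.get(ftype, GGMLQuantizationType.F16)

    print(default_quant_type(LlamaFileType.MOSTLY_Q8_0).name)  # Q8_0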
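Each GGML_QUANT_SIZES entry is a (block_size, type_size) pair: elements per block and bytes per block. The three new types reuse the Q4_0 block layout, 32 elements in 2 + 16 = 18 bytes (a 2-byte scale plus 16 bytes of packed 4-bit values), so tensor byte counts follow directly from the pair. A short sketch of that arithmetic (the helper name is hypothetical):

    from gguf.constants import GGML_QUANT_SIZES, GGMLQuantizationType

    def tensor_nbytes(n_elements: int, qtype: GGMLQuantizationType) -> int:
        # (elements per block, bytes per block) for this quantization type
        block_size, type_size = GGML_QUANT_SIZES[qtype]
        assert n_elements % block_size == 0, "size must be a multiple of the block size"
        return (n_elements // block_size) * type_size

    # 4096 * 4096 elements -> 524288 blocks of 32 -> 9437184 bytes
    print(tensor_nbytes(4096 * 4096, GGMLQuantizationType.Q4_0_4_4))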