parse ggml_type and llama_ftype, allow specifying cfg file

Julia Bruckner 2024-04-25 11:42:09 +02:00
parent 6e09a26504
commit 238551ed8c
2 changed files with 86 additions and 134 deletions


@@ -32,34 +32,55 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
     { "IQ3_S",   LLAMA_FTYPE_MOSTLY_IQ3_S,   " 3.44 bpw quantization", },
     { "IQ3_M",   LLAMA_FTYPE_MOSTLY_IQ3_M,   " 3.66 bpw quantization mix", },
     { "Q3_K",    LLAMA_FTYPE_MOSTLY_Q3_K_M,  "alias for Q3_K_M" },
     { "IQ3_XS",  LLAMA_FTYPE_MOSTLY_IQ3_XS,  " 3.3 bpw quantization", },
     { "Q3_K_S",  LLAMA_FTYPE_MOSTLY_Q3_K_S,  " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
     { "Q3_K_M",  LLAMA_FTYPE_MOSTLY_Q3_K_M,  " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
     { "Q3_K_L",  LLAMA_FTYPE_MOSTLY_Q3_K_L,  " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
     { "IQ4_NL",  LLAMA_FTYPE_MOSTLY_IQ4_NL,  " 4.50 bpw non-linear quantization", },
     { "IQ4_XS",  LLAMA_FTYPE_MOSTLY_IQ4_XS,  " 4.25 bpw non-linear quantization", },
     { "Q4_K",    LLAMA_FTYPE_MOSTLY_Q4_K_M,  "alias for Q4_K_M", },
     { "Q4_K_S",  LLAMA_FTYPE_MOSTLY_Q4_K_S,  " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
     { "Q4_K_M",  LLAMA_FTYPE_MOSTLY_Q4_K_M,  " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
     { "Q5_K",    LLAMA_FTYPE_MOSTLY_Q5_K_M,  "alias for Q5_K_M", },
     { "Q5_K_S",  LLAMA_FTYPE_MOSTLY_Q5_K_S,  " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
     { "Q5_K_M",  LLAMA_FTYPE_MOSTLY_Q5_K_M,  " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
     { "Q6_K",    LLAMA_FTYPE_MOSTLY_Q6_K,    " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
     { "Q8_0",    LLAMA_FTYPE_MOSTLY_Q8_0,    " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
     { "F16",     LLAMA_FTYPE_MOSTLY_F16,     "13.00G @ 7B", },
     { "F32",     LLAMA_FTYPE_ALL_F32,        "26.00G @ 7B", },
-    { "CUSTOM",  LLAMA_FTYPE_CUSTOM,         "per-layer scheme from file (quant.cfg)", },
+    { "CUSTOM",  LLAMA_FTYPE_CUSTOM,         "[:filename] Custom quant config (quant.cfg if not specified)", },
     // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
     { "COPY",    LLAMA_FTYPE_ALL_F32,        "only copy tensors, no quantizing", },
 };

-static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
+static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out, std::string & custom_cfg_filename_out) {
     std::string ftype_str;
     for (auto ch : ftype_str_in) {
         ftype_str.push_back(std::toupper(ch));
     }
+    if (ftype_str.find("CUSTOM:") == 0) {
+        // custom quant mix, config filename given after the colon
+        ftype = LLAMA_FTYPE_CUSTOM;
+        ftype_str_out = "CUSTOM";
+        if (ftype_str.length() > 7) {
+            // extract config filename (take it from ftype_str_in to keep the original casing)
+            std::string custom_cfg = ftype_str_in.substr(7);
+            custom_cfg_filename_out = custom_cfg;
+        } else {
+            return false;
+        }
+        return true;
+    } else if (ftype_str.find("CUSTOM") == 0) {
+        // custom quant mix with the default config file
+        ftype = LLAMA_FTYPE_CUSTOM;
+        ftype_str_out = "CUSTOM";
+        custom_cfg_filename_out = "quant.cfg";
+        return true;
+    }
     for (auto & it : QUANT_OPTIONS) {
         if (it.name == ftype_str) {
             ftype = it.ftype;
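The CUSTOM[:filename] syntax accepted above works as follows: bare CUSTOM selects the default quant.cfg, CUSTOM:<file> selects an explicit config file, and a trailing colon with no filename is rejected. A minimal, self-contained sketch of that prefix logic, for illustration only (parse_custom_spec is a hypothetical helper, not part of this commit):

#include <cctype>
#include <cstdio>
#include <string>

static bool parse_custom_spec(const std::string & spec, std::string & cfg_out) {
    std::string upper;
    for (char ch : spec) {
        upper.push_back(std::toupper((unsigned char) ch));
    }
    if (upper.find("CUSTOM:") == 0) {
        if (spec.length() <= 7) {
            return false;             // "CUSTOM:" with nothing after the colon
        }
        cfg_out = spec.substr(7);     // take from the original spec to keep its casing
        return true;
    }
    if (upper.find("CUSTOM") == 0) {
        cfg_out = "quant.cfg";        // no filename given -> default config file
        return true;
    }
    return false;                     // not a custom scheme; fall through to QUANT_OPTIONS
}

int main() {
    std::string a, b, c;
    printf("%d '%s'\n", parse_custom_spec("custom:My_Mix.cfg", a), a.c_str()); // 1 'My_Mix.cfg'
    printf("%d '%s'\n", parse_custom_spec("CUSTOM", b), b.c_str());            // 1 'quant.cfg'
    printf("%d '%s'\n", parse_custom_spec("Q4_K_M", c), c.c_str());            // 0 ''
    return 0;
}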
@@ -203,7 +224,7 @@ static ggml_type parse_ggml_type(const char * arg) {
     for (int j = 0; j < GGML_TYPE_COUNT; ++j) {
         auto type = ggml_type(j);
         const auto * name = ggml_type_name(type);
-        if (name && strcmp(arg, name) == 0) {
+        if (name && strcasecmp(arg, name) == 0) {
             result = type; break;
         }
     }
@@ -253,7 +274,7 @@ static bool read_custom_quant_config(const std::string & filename, llama_model_quantize_ftype_override & override) {
     std::vector<std::string> names;
     std::vector<ggml_type> types;

-    printf("%s: reading custom quantization scheme from %s:\n", __func__, filename.c_str());
+    printf("reading custom quantization mix from %s:\n", filename.c_str());

     if (!file.is_open()) {
         fprintf(stderr, "%s: failed to open file: '%s'\n", __func__, filename.c_str());
@@ -261,25 +282,41 @@
     }

     while (getline(file, line)) {
-        // Skip empty lines and comments
+        // skip empty lines and comments
         if (line.empty() || line[0] == '#') continue;

+        // default file type
         if (line.find("ftype=") == 0) {
-            int ftype = std::stoi(line.substr(6));
+            std::string ftype_str = line.substr(6);
+            std::string ftype_name;
+            std::string custom_quant_config_filename;
+            llama_ftype ftype;
+            if (!try_parse_ftype(ftype_str, ftype, ftype_name, custom_quant_config_filename)) {
+                fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, ftype_str.c_str());
+                file.close();
+                return false;
+            }
             override.default_ftype = static_cast<llama_ftype>(ftype);
-            printf(" default ftype = %i\n", ftype);
+            printf(" default ftype = %i (%s)\n", ftype, ftype_name.c_str());
             continue;
         }

+        // tensor overrides
         size_t pos = line.find('=');
         if (pos != std::string::npos) {
-            std::string name = line.substr(0, pos);
-            int type = std::stoi(line.substr(pos + 1));
-            names.push_back(name);
+            std::string tensor_name = line.substr(0, pos);
+            std::string type_name = line.substr(pos + 1);
+            ggml_type type = parse_ggml_type(type_name.c_str());
+            if (type < 0 || type >= GGML_TYPE_COUNT) {
+                fprintf(stderr, "%s: invalid ggml_type '%s'\n", __func__, type_name.c_str());
+                file.close();
+                return false;
+            }
+            names.push_back(tensor_name);
             types.push_back(static_cast<ggml_type>(type));
-            printf(" %s = %i\n", name.c_str(), type);
+            printf(" %s = %i (%s)\n", tensor_name.c_str(), type, type_name.c_str());
         }
     }
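For illustration only (not output captured from a real run): with the quant.cfg shipped at the bottom of this commit, and taking the enum values quoted in the removed comments below (LLAMA_FTYPE_MOSTLY_Q6_K = 18, GGML_TYPE_Q5_K = 13, GGML_TYPE_IQ3_S = 21), the printf calls above would emit something like:

reading custom quantization mix from quant.cfg:
 default ftype = 18 (Q6_K)
 blk.10.ffn_up.weight = 13 (Q5_K)
 ...
 *_down* = 21 (IQ3_S)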
@@ -383,9 +420,10 @@ int main(int argc, char ** argv) {
     const std::string fname_inp = argv[arg_idx];
     arg_idx++;
     std::string fname_out;
+    std::string custom_quant_config_filename;

     std::string ftype_str;
-    if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
+    if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str, custom_quant_config_filename)) {
         std::string fpath;
         const size_t pos = fname_inp.find_last_of("/\\");
         if (pos != std::string::npos) {
@@ -406,7 +444,7 @@ int main(int argc, char ** argv) {
             return 1;
         }

-        if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
+        if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str, custom_quant_config_filename)) {
             fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
             return 1;
         }
@@ -417,8 +455,7 @@ int main(int argc, char ** argv) {
     if (ftype_str == "CUSTOM") {
         params.override_ftype = new llama_model_quantize_ftype_override;
-        if (!read_custom_quant_config("quant.cfg", *params.override_ftype)) {
-            fprintf(stderr, "%s: failed to read custom quant config file!\n", __func__);
+        if (!read_custom_quant_config(custom_quant_config_filename, *params.override_ftype)) {
             return 1;
         }
     }
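Net effect of the main() changes: the config filename now flows from the ftype argument into read_custom_quant_config instead of being hard-coded, so the tool can presumably be invoked with an ftype of CUSTOM (falling back to quant.cfg) or CUSTOM:my_scheme.cfg (my_scheme.cfg being an arbitrary example name). The extra fprintf at this call site could be dropped because read_custom_quant_config already prints its own diagnostics on failure.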

quant.cfg

@@ -1,121 +1,36 @@
-# this defines the default ftype (the quantization mix code,
+# Defines the default ftype (the quantization mix code,
 # that you pass to quantize if you're not using custom mix).
 # tensors that are not overriden below will be quantized
-# according to this scheme.
+# according to this mix.
 #
-# allowed values:
-#   LLAMA_FTYPE_ALL_F32 = 0,
-#   LLAMA_FTYPE_MOSTLY_F16 = 1,            // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q4_0 = 2,           // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q4_1 = 3,           // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4,  // tok_embeddings.weight and output.weight are F16
-#   // LLAMA_FTYPE_MOSTLY_Q4_2 = 5,        // support has been removed
-#   // LLAMA_FTYPE_MOSTLY_Q4_3 = 6,        // support has been removed
-#   LLAMA_FTYPE_MOSTLY_Q8_0 = 7,           // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q5_0 = 8,           // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q5_1 = 9,           // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q2_K = 10,          // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,        // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,        // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,        // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,        // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,        // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,        // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,        // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q6_K = 18,          // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19,       // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_IQ2_XS = 20,        // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q2_K_S = 21,        // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_IQ3_XS = 22,        // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23,       // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_IQ1_S = 24,         // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_IQ4_NL = 25,        // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_IQ3_S = 26,         // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_IQ3_M = 27,         // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_IQ2_S = 28,         // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_IQ2_M = 29,         // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_IQ4_XS = 30,        // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_IQ1_M = 31,         // except 1d tensors
-ftype=7
+# Must be one of
+#   Q4_0, Q4_1, Q5_0, Q5_1, IQ2_XXS, IQ2_XS, IQ2_S, IQ2_M,
+#   IQ1_S, IQ1_M, Q2_K, Q2_K_S, IQ3_XXS, IQ3_S, IQ3_M, Q3_K,
+#   IQ3_XS, Q3_K_S, Q3_K_M, Q3_K_L, IQ4_NL, IQ4_XS, Q4_K,
+#   Q4_K_S, Q4_K_M, Q5_K, Q5_K_S, Q5_K_M, Q6_K, Q8_0, F16
+ftype=Q6_K

-# this defines an override for tensors with names matching
-# a given string. filters are processed in order given, and the
-# first matching will be used.
+# Defines overrides for tensors with names matching a given
+# string. Filters are processed in order given, the first
+# matching will be used.
 #
 # Wildcards are allowed:
 #   ?   single character
 #   *   multiple characters
 #
+# Type must be one of
+#   F16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, Q8_1, Q2_K, Q3_K,
+#   Q4_K, Q5_K, Q6_K, Q8_K, IQ2_XXS, IQ2_XS, IQ3_XXS,
+#   IQ1_S, IQ4_NL, IQ3_S, IQ2_S, IQ4_XS, IQ1_M
-blk.10.ffn_up.weight=7
-blk.1?.ffn_up.weight=10
-blk.2?.ffn_up.weight=10
-blk.1?.attn*=23
-blk.2?.attn*=23
-*down*=14
-*gate*=12
-#   GGML_TYPE_F32 = 0,
-#   GGML_TYPE_F16 = 1,
-#   GGML_TYPE_Q4_0 = 2,
-#   GGML_TYPE_Q4_1 = 3,
-#   // GGML_TYPE_Q4_2 = 4,  support has been removed
-#   // GGML_TYPE_Q4_3 = 5,  support has been removed
-#   GGML_TYPE_Q5_0 = 6,
-#   GGML_TYPE_Q5_1 = 7,
-#   GGML_TYPE_Q8_0 = 8,
-#   GGML_TYPE_Q8_1 = 9,
-#   GGML_TYPE_Q2_K = 10,
-#   GGML_TYPE_Q3_K = 11,
-#   GGML_TYPE_Q4_K = 12,
-#   GGML_TYPE_Q5_K = 13,
-#   GGML_TYPE_Q6_K = 14,
-#   GGML_TYPE_Q8_K = 15,
-#   GGML_TYPE_IQ2_XXS = 16,
-#   GGML_TYPE_IQ2_XS = 17,
-#   GGML_TYPE_IQ3_XXS = 18,
-#   GGML_TYPE_IQ1_S = 19,
-#   GGML_TYPE_IQ4_NL = 20,
-#   GGML_TYPE_IQ3_S = 21,
-#   GGML_TYPE_IQ2_S = 22,
-#   GGML_TYPE_IQ4_XS = 23,
-#   GGML_TYPE_I8 = 24,
-#   GGML_TYPE_I16 = 25,
-#   GGML_TYPE_I32 = 26,
-#   GGML_TYPE_I64 = 27,
-#   GGML_TYPE_F64 = 28,
-#   GGML_TYPE_IQ1_M = 29,
+blk.10.ffn_up.weight=Q5_K
+blk.1?.ffn_up.weight=Q4_K
+blk.23.*=Q2_K
+blk.24.*=Q2_K
+blk.25.*=Q2_K
+blk.2?.ffn_up.weight=Q4_K
+*_gate*=Q4_K
+*.attn*=IQ4_XS
+*_down*=IQ3_S
+output.weight=Q5_K
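The override matching relies on the ? and * wildcards described in the comments above; the matcher itself is outside this diff. A minimal recursive sketch of those semantics (? = exactly one character, * = any run, possibly empty), for illustration only and not the code llama.cpp actually uses:

#include <cstdio>

static bool wildcard_match(const char * pat, const char * str) {
    if (*pat == '\0') {
        return *str == '\0';  // pattern used up: match only if the name is too
    }
    if (*pat == '*') {
        // '*' matches the empty run, or absorbs one more character of the name
        return wildcard_match(pat + 1, str) || (*str != '\0' && wildcard_match(pat, str + 1));
    }
    if (*str != '\0' && (*pat == '?' || *pat == *str)) {
        return wildcard_match(pat + 1, str + 1);  // literal or '?' match, advance both
    }
    return false;
}

int main() {
    // patterns taken from the overrides above; the first matching filter wins,
    // which is why blk.10.ffn_up.weight is listed before blk.1?.ffn_up.weight
    printf("%d\n", wildcard_match("blk.1?.ffn_up.weight", "blk.12.ffn_up.weight")); // 1
    printf("%d\n", wildcard_match("*_down*", "blk.7.ffn_down.weight"));             // 1
    printf("%d\n", wildcard_match("blk.23.*", "blk.3.attn_k.weight"));              // 0
    return 0;
}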