From dbe6483e7ef564f6900ecae209e3178af877e0c7 Mon Sep 17 00:00:00 2001
From: Julia Bruckner
Date: Tue, 23 Apr 2024 13:35:03 +0200
Subject: [PATCH] custom quantization schemas

---
 examples/quantize/quantize.cpp | 67 +++++++++++++++++++++++++++++++++-
 llama.cpp                      | 28 ++++++++++++--
 llama.h                        | 11 +++++-
 3 files changed, 100 insertions(+), 6 deletions(-)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 64cb6db19..2c22f8450 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -49,11 +49,11 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
     { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "13.00G              @ 7B", },
     { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G              @ 7B", },
+    { "CUSTOM", LLAMA_FTYPE_CUSTOM,        "per-layer scheme from file (quant.cfg)", },
     // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
     { "COPY",   LLAMA_FTYPE_ALL_F32,       "only copy tensors, no quantizing", },
 };
-
 static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
     std::string ftype_str;
@@ -247,6 +247,60 @@ static bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
+static bool read_custom_quant_config(const std::string & filename, llama_model_quantize_ftype_override & override) {
+    std::ifstream file(filename);
+    std::string line;
+    std::vector<std::string> names;
+    std::vector<ggml_type> types;
+
+    printf("%s: reading custom quantization scheme from %s:\n", __func__, filename.c_str());
+
+    if (!file.is_open()) {
+        fprintf(stderr, "%s: failed to open file: '%s'\n", __func__, filename.c_str());
+        return false;
+    }
+
+    while (getline(file, line)) {
+        // Skip empty lines and comments
+        if (line.empty() || line[0] == '#') continue;
+        printf("  %s\n", line.c_str());
+
+        // default file type
+        if (line.find("ftype=") == 0) {
+            int ftype = std::stoi(line.substr(6));
+            override.default_ftype = static_cast<llama_ftype>(ftype);
+            printf("  default ftype = %i\n", ftype);
+            continue;
+        }
+
+        // tensor overrides
+        size_t pos = line.find('=');
+        if (pos != std::string::npos) {
+            std::string name = line.substr(0, pos);
+            int type = std::stoi(line.substr(pos + 1));
+            names.push_back(name);
+            types.push_back(static_cast<ggml_type>(type));
+            printf("  %s = %i\n", name.c_str(), type);
+        }
+    }
+
+    printf("\n");
+
+    // allocate memory for names and types
+    override.names = new const char*[names.size()];
+    override.types = new ggml_type[types.size()];
+    override.count = names.size();
+
+    for (size_t i = 0; i < names.size(); ++i) {
+        override.names[i] = strdup(names[i].c_str());
+        override.types[i] = types[i];
+    }
+
+    file.close();
+
+    return true;
+}
+
 int main(int argc, char ** argv) {
     if (argc < 3) {
         usage(argv[0]);
@@ -352,13 +406,24 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s: missing ftype\n", __func__);
         return 1;
     }
+
     if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
         fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
         return 1;
     }
+
     if (ftype_str == "COPY") {
         params.only_copy = true;
     }
+
+    if (ftype_str == "CUSTOM") {
+        params.override_ftype = new llama_model_quantize_ftype_override;
+        if(!read_custom_quant_config("quant.cfg", *params.override_ftype)) {
+            fprintf(stderr, "%s: failed to read custom quant config file!\n", __func__);
+            return 1;
+        }
+    }
+
     arg_idx++;
 }
diff --git a/llama.cpp b/llama.cpp
index a25d115c1..bf48e38e3 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3610,6 +3610,9 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ3_S:  return "IQ3_S - 3.4375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_M:  return "IQ3_S mix - 3.66 bpw";
 
+        // Custom quantization scheme
+        case LLAMA_FTYPE_CUSTOM: return "CUSTOM";
+
         default: return "unknown, may not work";
     }
 }
@@ -14195,9 +14198,13 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
 
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type default_type;
-    llama_ftype ftype = params->ftype;
 
-    switch (params->ftype) {
+    llama_ftype ftype =
+        params->override_ftype
+        ? params->override_ftype->default_ftype
+        : params->ftype;
+
+    switch (ftype) {
     case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break;
     case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
     case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
@@ -14279,7 +14286,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // copy the KV pairs from the input file
     gguf_set_kv     (ctx_out, ml.meta);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
-    gguf_set_val_u32(ctx_out, "general.file_type", ftype);
+    gguf_set_val_u32(ctx_out, "general.file_type", params->ftype);
 
     // Remove split metadata
     gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
     gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
@@ -14417,6 +14424,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_type = params->output_tensor_type;
         }
 
+        // look up tensor name in type override map, if not found use default
+        // type as determined by the ftype.
+        if(params->override_ftype) {
+            for (uint32_t i = 0; i < params->override_ftype->count; ++i) {
+                if (strcmp(params->override_ftype->names[i], tensor->name) == 0) {
+                    //LLAMA_LOG_INFO("\n%s: %s %s ---> %s\n", __func__, tensor->name, ggml_type_name(new_type), ggml_type_name(params->override_ftype->types[i]));
+                    new_type = params->override_ftype->types[i];
+                    break;
+                }
+            }
+        }
+
         // If we've decided to quantize to the same type the tensor is already
         // in then there's nothing to do.
         quantize = tensor->type != new_type;
@@ -14886,7 +14905,8 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.only_copy                   =*/ false,
         /*.pure                        =*/ false,
         /*.imatrix                     =*/ nullptr,
-        /*.kv_overrides                =*/ nullptr,
+        /*.kv_overrides                =*/ nullptr,
+        /*.override_ftype              =*/ nullptr
     };
 
     return result;
diff --git a/llama.h b/llama.h
index 4effca42c..ea40345b3 100644
--- a/llama.h
+++ b/llama.h
@@ -122,6 +122,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_M   = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS  = 30, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ1_M   = 31, // except 1d tensors
+        LLAMA_FTYPE_CUSTOM         = 32, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -278,6 +279,13 @@ extern "C" {
         void * abort_callback_data;
     };
 
+    typedef struct llama_model_quantize_ftype_override {
+        enum llama_ftype default_ftype; // default type if not overridden
+        uint32_t count;                 // number of overrides
+        const char ** names;            // tensor names
+        enum ggml_type * types;         // tensor type override
+    } llama_model_quantize_custom_ftype;
+
     // model quantization parameters
     typedef struct llama_model_quantize_params {
         int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
@@ -286,10 +294,11 @@ extern "C" {
         enum ggml_type token_embedding_type; // itoken embeddings tensor type
         bool allow_requantize;               // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor;         // quantize output.weight
-        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool only_copy;                      // only copy tensors - ftype, override_ftype, allow_requantize and quantize_output_tensor are ignored
         bool pure;                           // quantize all tensors to the default type
         void * imatrix;                      // pointer to importance matrix data
         void * kv_overrides;                 // pointer to vector containing overrides
+        struct llama_model_quantize_ftype_override * override_ftype; // custom quantization scheme
     } llama_model_quantize_params;
 
     // grammar types
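
Note (not part of the patch): the quant.cfg format is defined only implicitly by read_custom_quant_config() above, which accepts '#' comment lines, an optional "ftype=N" line giving the default llama_ftype as an integer, and "name=N" lines mapping a tensor name to a ggml_type integer. A config file could therefore look like the sketch below; the tensor names and numeric enum values are illustrative placeholders, not values taken from this patch.

    # default ftype (integer value of the llama_ftype enum in llama.h),
    # applied to every tensor without an explicit override
    ftype=15
    # per-tensor overrides: <tensor name>=<integer value of the ggml_type enum>
    output.weight=14
    token_embd.weight=8
    blk.0.attn_v.weight=14

Because the file name "quant.cfg" is hard-coded at the call site in main(), the file has to be in the directory quantize is run from, and the scheme is selected with the new CUSTOM type, e.g.: ./quantize model-f16.gguf model-custom.gguf CUSTOM

Callers of the C API can skip the config file entirely and fill in the new struct themselves. A minimal sketch, assuming the llama.h additions from this patch; quantize_with_override and the chosen tensor name/type values are hypothetical:

    #include "llama.h"

    static uint32_t quantize_with_override(const char * fname_inp, const char * fname_out) {
        // illustrative override: keep output.weight at Q6_K, default everything else to Q4_K_M
        const char *   names[] = { "output.weight" };
        enum ggml_type types[] = { GGML_TYPE_Q6_K };

        struct llama_model_quantize_ftype_override ov = {
            /*.default_ftype =*/ LLAMA_FTYPE_MOSTLY_Q4_K_M,
            /*.count         =*/ 1,
            /*.names         =*/ names,
            /*.types         =*/ types,
        };

        struct llama_model_quantize_params params = llama_model_quantize_default_params();
        params.override_ftype = &ov;

        // llama_model_quantize() is the existing entry point; only the params struct changed
        return llama_model_quantize(fname_inp, fname_out, &params);
    }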