From dbe6483e7ef564f6900ecae209e3178af877e0c7 Mon Sep 17 00:00:00 2001
From: Julia Bruckner
Date: Tue, 23 Apr 2024 13:35:03 +0200
Subject: [PATCH] custom quantization schemas

---
 examples/quantize/quantize.cpp | 67 +++++++++++++++++++++++++++++++++-
 llama.cpp                      | 28 ++++++++++++--
 llama.h                        | 11 +++++-
 3 files changed, 100 insertions(+), 6 deletions(-)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 64cb6db19..2c22f8450 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -49,11 +49,11 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
     { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "13.00G              @ 7B", },
     { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G              @ 7B", },
+    { "CUSTOM", LLAMA_FTYPE_CUSTOM,        "per-layer scheme from file (quant.cfg)", },
     // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
     { "COPY",   LLAMA_FTYPE_ALL_F32,       "only copy tensors, no quantizing", },
 };
-
 static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
     std::string ftype_str;
@@ -247,6 +247,60 @@ static bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
+static bool read_custom_quant_config(const std::string & filename, llama_model_quantize_ftype_override & override) {
+    std::ifstream file(filename);
+    std::string line;
+    std::vector<std::string> names;
+    std::vector<ggml_type> types;
+
+    printf("%s: reading custom quantization scheme from %s:\n", __func__, filename.c_str());
+
+    if (!file.is_open()) {
+        fprintf(stderr, "%s: failed to open file: '%s'\n", __func__, filename.c_str());
+        return false;
+    }
+
+    while (getline(file, line)) {
+        // Skip empty lines and comments
+        if (line.empty() || line[0] == '#') continue;
+        printf("  %s\n", line.c_str());
+
+        // default file type
+        if (line.find("ftype=") == 0) {
+            int ftype = std::stoi(line.substr(6));
+            override.default_ftype = static_cast<llama_ftype>(ftype);
+            printf("  default ftype = %i\n", ftype);
+            continue;
+        }
+
+        // tensor overrides
+        size_t pos = line.find('=');
+        if (pos != std::string::npos) {
+            std::string name = line.substr(0, pos);
+            int type = std::stoi(line.substr(pos + 1));
+            names.push_back(name);
+            types.push_back(static_cast<ggml_type>(type));
+            printf("  %s = %i\n", name.c_str(), type);
+        }
+    }
+
+    printf("\n");
+
+    // allocate memory for names and types
+    override.names = new const char*[names.size()];
+    override.types = new ggml_type[types.size()];
+    override.count = names.size();
+
+    for (size_t i = 0; i < names.size(); ++i) {
+        override.names[i] = strdup(names[i].c_str());
+        override.types[i] = types[i];
+    }
+
+    file.close();
+
+    return true;
+}
+
 int main(int argc, char ** argv) {
     if (argc < 3) {
         usage(argv[0]);
@@ -352,13 +406,24 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s: missing ftype\n", __func__);
         return 1;
     }
+
     if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
         fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
         return 1;
     }
+
     if (ftype_str == "COPY") {
         params.only_copy = true;
     }
+
+    if (ftype_str == "CUSTOM") {
+        params.override_ftype = new llama_model_quantize_ftype_override;
+        if(!read_custom_quant_config("quant.cfg", *params.override_ftype)) {
+            fprintf(stderr, "%s: failed to read custom quant config file!\n", __func__);
+            return 1;
+        }
+    }
+
     arg_idx++;
 }
diff --git a/llama.cpp b/llama.cpp
index a25d115c1..bf48e38e3 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3610,6 +3610,9 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ3_S:  return "IQ3_S - 3.4375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_M:  return "IQ3_S mix - 3.66 bpw";
 
+        // Custom quantization scheme
+        case LLAMA_FTYPE_CUSTOM: return "CUSTOM";
+
         default: return "unknown, may not work";
     }
 }
@@ -14195,9 +14198,13 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
 
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type default_type;
-    llama_ftype ftype = params->ftype;
 
-    switch (params->ftype) {
+    llama_ftype ftype =
+        params->override_ftype
+        ? params->override_ftype->default_ftype
+        : params->ftype;
+
+    switch (ftype) {
     case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break;
     case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
     case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
@@ -14279,7 +14286,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // copy the KV pairs from the input file
     gguf_set_kv     (ctx_out, ml.meta);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
-    gguf_set_val_u32(ctx_out, "general.file_type", ftype);
+    gguf_set_val_u32(ctx_out, "general.file_type", params->ftype);
 
     // Remove split metadata
     gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
     gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
@@ -14417,6 +14424,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_type = params->output_tensor_type;
         }
 
+        // look up tensor name in type override map, if not found use default
+        // type as determined by the ftype.
+        if(params->override_ftype) {
+            for (uint32_t i = 0; i < params->override_ftype->count; ++i) {
+                if (strcmp(params->override_ftype->names[i], tensor->name) == 0) {
+                    //LLAMA_LOG_INFO("\n%s: %s %s ---> %s\n", __func__, tensor->name, ggml_type_name(new_type), ggml_type_name(params->override_ftype->types[i]));
+                    new_type = params->override_ftype->types[i];
+                    break;
+                }
+            }
+        }
+
         // If we've decided to quantize to the same type the tensor is already
         // in then there's nothing to do.
         quantize = tensor->type != new_type;
@@ -14886,7 +14905,8 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.only_copy                   =*/ false,
         /*.pure                        =*/ false,
         /*.imatrix                     =*/ nullptr,
-        /*.kv_overrides                =*/ nullptr,
+        /*.kv_overrides                =*/ nullptr,
+        /*.override_ftype              =*/ nullptr
     };
 
     return result;
diff --git a/llama.h b/llama.h
index 4effca42c..ea40345b3 100644
--- a/llama.h
+++ b/llama.h
@@ -122,6 +122,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_M   = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS  = 30, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ1_M   = 31, // except 1d tensors
+        LLAMA_FTYPE_CUSTOM         = 32, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -278,6 +279,13 @@ extern "C" {
         void * abort_callback_data;
     };
 
+    typedef struct llama_model_quantize_ftype_override {
+        enum llama_ftype default_ftype; // default type if not overridden
+        uint32_t count;                 // number of overrides
+        const char ** names;            // tensor names
+        enum ggml_type * types;         // tensor type override
+    } llama_model_quantize_custom_ftype;
+
     // model quantization parameters
     typedef struct llama_model_quantize_params {
         int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
@@ -286,10 +294,11 @@ extern "C" {
         enum ggml_type token_embedding_type; // itoken embeddings tensor type
         bool allow_requantize;               // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor;         // quantize output.weight
-        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool only_copy;                      // only copy tensors - ftype, override_ftype, allow_requantize and quantize_output_tensor are ignored
         bool pure;                           // quantize all tensors to the default type
         void * imatrix;                      // pointer to importance matrix data
         void * kv_overrides;                 // pointer to vector containing overrides
+        struct llama_model_quantize_ftype_override * override_ftype; // custom quantization scheme
     } llama_model_quantize_params;
 
     // grammar types
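
Note (not part of the patch): the quant.cfg format is defined only implicitly by read_custom_quant_config() above, which accepts '#' comment lines, an optional "ftype=N" line giving the default llama_ftype as an integer, and "name=N" lines mapping a tensor name to a ggml_type integer. A config file could therefore look like the sketch below; the tensor names and numeric enum values are illustrative placeholders, not values taken from this patch.

    # default ftype (integer value of the llama_ftype enum in llama.h),
    # applied to every tensor without an explicit override
    ftype=15
    # per-tensor overrides: <tensor name>=<integer value of the ggml_type enum>
    output.weight=14
    token_embd.weight=8
    blk.0.attn_v.weight=14

Because the file name "quant.cfg" is hard-coded at the call site in main(), the file has to be in the directory quantize is run from, and the scheme is selected with the new CUSTOM type, e.g.: ./quantize model-f16.gguf model-custom.gguf CUSTOM

Callers of the C API can skip the config file entirely and fill in the new struct themselves. A minimal sketch, assuming the llama.h additions from this patch; quantize_with_override and the chosen tensor name/type values are hypothetical:

    #include "llama.h"

    static uint32_t quantize_with_override(const char * fname_inp, const char * fname_out) {
        // illustrative override: keep output.weight at Q6_K, default everything else to Q4_K_M
        const char *   names[] = { "output.weight" };
        enum ggml_type types[] = { GGML_TYPE_Q6_K };

        struct llama_model_quantize_ftype_override ov = {
            /*.default_ftype =*/ LLAMA_FTYPE_MOSTLY_Q4_K_M,
            /*.count         =*/ 1,
            /*.names         =*/ names,
            /*.types         =*/ types,
        };

        struct llama_model_quantize_params params = llama_model_quantize_default_params();
        params.override_ftype = &ov;

        // llama_model_quantize() is the existing entry point; only the params struct changed
        return llama_model_quantize(fname_inp, fname_out, &params);
    }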