parse ggml_type and llama_ftype, allow specifying cfg file

Julia Bruckner 2024-04-25 11:42:09 +02:00
parent 6e09a26504
commit 238551ed8c
2 changed files with 86 additions and 134 deletions


@@ -32,34 +32,55 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
     { "IQ3_S",   LLAMA_FTYPE_MOSTLY_IQ3_S,   " 3.44 bpw quantization", },
     { "IQ3_M",   LLAMA_FTYPE_MOSTLY_IQ3_M,   " 3.66 bpw quantization mix", },
     { "Q3_K",    LLAMA_FTYPE_MOSTLY_Q3_K_M,  "alias for Q3_K_M" },
     { "IQ3_XS",  LLAMA_FTYPE_MOSTLY_IQ3_XS,  " 3.3 bpw quantization", },
     { "Q3_K_S",  LLAMA_FTYPE_MOSTLY_Q3_K_S,  " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
     { "Q3_K_M",  LLAMA_FTYPE_MOSTLY_Q3_K_M,  " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
     { "Q3_K_L",  LLAMA_FTYPE_MOSTLY_Q3_K_L,  " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
     { "IQ4_NL",  LLAMA_FTYPE_MOSTLY_IQ4_NL,  " 4.50 bpw non-linear quantization", },
     { "IQ4_XS",  LLAMA_FTYPE_MOSTLY_IQ4_XS,  " 4.25 bpw non-linear quantization", },
     { "Q4_K",    LLAMA_FTYPE_MOSTLY_Q4_K_M,  "alias for Q4_K_M", },
     { "Q4_K_S",  LLAMA_FTYPE_MOSTLY_Q4_K_S,  " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
     { "Q4_K_M",  LLAMA_FTYPE_MOSTLY_Q4_K_M,  " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
     { "Q5_K",    LLAMA_FTYPE_MOSTLY_Q5_K_M,  "alias for Q5_K_M", },
     { "Q5_K_S",  LLAMA_FTYPE_MOSTLY_Q5_K_S,  " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
     { "Q5_K_M",  LLAMA_FTYPE_MOSTLY_Q5_K_M,  " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
     { "Q6_K",    LLAMA_FTYPE_MOSTLY_Q6_K,    " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
     { "Q8_0",    LLAMA_FTYPE_MOSTLY_Q8_0,    " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
     { "F16",     LLAMA_FTYPE_MOSTLY_F16,     "13.00G @ 7B", },
     { "F32",     LLAMA_FTYPE_ALL_F32,        "26.00G @ 7B", },
-    { "CUSTOM",  LLAMA_FTYPE_CUSTOM,         "per-layer scheme from file (quant.cfg)", },
+    { "CUSTOM",  LLAMA_FTYPE_CUSTOM,         "[:filename] Custom quant config (quant.cfg if not specified)", },
     // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
     { "COPY",    LLAMA_FTYPE_ALL_F32,        "only copy tensors, no quantizing", },
 };

-static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
+static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out, std::string & custom_cfg_filename_out) {
     std::string ftype_str;
     for (auto ch : ftype_str_in) {
         ftype_str.push_back(std::toupper(ch));
     }
+    if (ftype_str.find("CUSTOM:") == 0) {
+        // custom quant mix, config filename given after the colon
+        ftype = LLAMA_FTYPE_CUSTOM;
+        ftype_str_out = "CUSTOM";
+        if (ftype_str.length() > 7) {
+            // extract config filename (take it from ftype_str_in to keep the original casing)
+            std::string custom_cfg = ftype_str_in.substr(7);
+            custom_cfg_filename_out = custom_cfg;
+        } else {
+            return false;
+        }
+        return true;
+    } else if (ftype_str.find("CUSTOM") == 0) {
+        // custom quant mix with the default config file
+        ftype = LLAMA_FTYPE_CUSTOM;
+        ftype_str_out = "CUSTOM";
+        custom_cfg_filename_out = "quant.cfg";
+        return true;
+    }
     for (auto & it : QUANT_OPTIONS) {
         if (it.name == ftype_str) {
             ftype = it.ftype;
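The CUSTOM[:filename] syntax accepted above works as follows: bare CUSTOM selects the default quant.cfg, CUSTOM:<file> selects an explicit config file, and a trailing colon with no filename is rejected. A minimal, self-contained sketch of that prefix logic, for illustration only (parse_custom_spec is a hypothetical helper, not part of this commit):

#include <cctype>
#include <cstdio>
#include <string>

static bool parse_custom_spec(const std::string & spec, std::string & cfg_out) {
    std::string upper;
    for (char ch : spec) {
        upper.push_back(std::toupper((unsigned char) ch));
    }
    if (upper.find("CUSTOM:") == 0) {
        if (spec.length() <= 7) {
            return false;             // "CUSTOM:" with nothing after the colon
        }
        cfg_out = spec.substr(7);     // take from the original spec to keep its casing
        return true;
    }
    if (upper.find("CUSTOM") == 0) {
        cfg_out = "quant.cfg";        // no filename given -> default config file
        return true;
    }
    return false;                     // not a custom scheme; fall through to QUANT_OPTIONS
}

int main() {
    std::string a, b, c;
    printf("%d '%s'\n", parse_custom_spec("custom:My_Mix.cfg", a), a.c_str()); // 1 'My_Mix.cfg'
    printf("%d '%s'\n", parse_custom_spec("CUSTOM", b), b.c_str());            // 1 'quant.cfg'
    printf("%d '%s'\n", parse_custom_spec("Q4_K_M", c), c.c_str());            // 0 ''
    return 0;
}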
@@ -203,7 +224,7 @@ static ggml_type parse_ggml_type(const char * arg) {
     for (int j = 0; j < GGML_TYPE_COUNT; ++j) {
         auto type = ggml_type(j);
         const auto * name = ggml_type_name(type);
-        if (name && strcmp(arg, name) == 0) {
+        if (name && strcasecmp(arg, name) == 0) {
             result = type; break;
         }
     }
@@ -253,7 +274,7 @@ static bool read_custom_quant_config(const std::string & filename, llama_model_quantize_ftype_override & override) {
     std::vector<std::string> names;
     std::vector<ggml_type> types;

-    printf("%s: reading custom quantization scheme from %s:\n", __func__, filename.c_str());
+    printf("reading custom quantization mix from %s:\n", filename.c_str());

     if (!file.is_open()) {
         fprintf(stderr, "%s: failed to open file: '%s'\n", __func__, filename.c_str());
@@ -261,25 +282,41 @@
     }

     while (getline(file, line)) {
-        // Skip empty lines and comments
+        // skip empty lines and comments
         if (line.empty() || line[0] == '#') continue;

+        // default file type
         if (line.find("ftype=") == 0) {
-            int ftype = std::stoi(line.substr(6));
+            std::string ftype_str = line.substr(6);
+            std::string ftype_name;
+            std::string custom_quant_config_filename;
+            llama_ftype ftype;
+            if (!try_parse_ftype(ftype_str, ftype, ftype_name, custom_quant_config_filename)) {
+                fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, ftype_str.c_str());
+                file.close();
+                return false;
+            }
             override.default_ftype = static_cast<llama_ftype>(ftype);
-            printf(" default ftype = %i\n", ftype);
+            printf(" default ftype = %i (%s)\n", ftype, ftype_name.c_str());
             continue;
         }

+        // tensor overrides
         size_t pos = line.find('=');
         if (pos != std::string::npos) {
-            std::string name = line.substr(0, pos);
-            int type = std::stoi(line.substr(pos + 1));
-            names.push_back(name);
+            std::string tensor_name = line.substr(0, pos);
+            std::string type_name = line.substr(pos + 1);
+            ggml_type type = parse_ggml_type(type_name.c_str());
+            if (type < 0 || type >= GGML_TYPE_COUNT) {
+                fprintf(stderr, "%s: invalid ggml_type '%s'\n", __func__, type_name.c_str());
+                file.close();
+                return false;
+            }
+            names.push_back(tensor_name);
             types.push_back(static_cast<ggml_type>(type));
-            printf(" %s = %i\n", name.c_str(), type);
+            printf(" %s = %i (%s)\n", tensor_name.c_str(), type, type_name.c_str());
         }
     }
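For illustration only (not output captured from a real run): with the quant.cfg shipped at the bottom of this commit, and taking the enum values quoted in the removed comments below (LLAMA_FTYPE_MOSTLY_Q6_K = 18, GGML_TYPE_Q5_K = 13, GGML_TYPE_IQ3_S = 21), the printf calls above would emit something like:

reading custom quantization mix from quant.cfg:
 default ftype = 18 (Q6_K)
 blk.10.ffn_up.weight = 13 (Q5_K)
 ...
 *_down* = 21 (IQ3_S)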
@@ -383,9 +420,10 @@ int main(int argc, char ** argv) {
     const std::string fname_inp = argv[arg_idx];
     arg_idx++;
     std::string fname_out;
+    std::string custom_quant_config_filename;

     std::string ftype_str;
-    if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
+    if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str, custom_quant_config_filename)) {
         std::string fpath;
         const size_t pos = fname_inp.find_last_of("/\\");
         if (pos != std::string::npos) {
@@ -406,7 +444,7 @@ int main(int argc, char ** argv) {
             return 1;
         }

-        if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
+        if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str, custom_quant_config_filename)) {
             fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
             return 1;
         }
@@ -417,8 +455,7 @@ int main(int argc, char ** argv) {
     if (ftype_str == "CUSTOM") {
         params.override_ftype = new llama_model_quantize_ftype_override;
-        if (!read_custom_quant_config("quant.cfg", *params.override_ftype)) {
-            fprintf(stderr, "%s: failed to read custom quant config file!\n", __func__);
+        if (!read_custom_quant_config(custom_quant_config_filename, *params.override_ftype)) {
             return 1;
         }
     }
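Net effect of the main() changes: the config filename now flows from the ftype argument into read_custom_quant_config instead of being hard-coded, so the tool can presumably be invoked with an ftype of CUSTOM (falling back to quant.cfg) or CUSTOM:my_scheme.cfg (my_scheme.cfg being an arbitrary example name). The extra fprintf at this call site could be dropped because read_custom_quant_config already prints its own diagnostics on failure.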

quant.cfg

@@ -1,121 +1,36 @@
-# this defines the default ftype (the quantization mix code,
+# Defines the default ftype (the quantization mix code,
 # that you pass to quantize if you're not using custom mix).
 # tensors that are not overriden below will be quantized
-# according to this scheme.
+# according to this mix.
 #
-# allowed values:
-#   LLAMA_FTYPE_ALL_F32 = 0,
-#   LLAMA_FTYPE_MOSTLY_F16 = 1,            // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q4_0 = 2,           // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q4_1 = 3,           // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4,  // tok_embeddings.weight and output.weight are F16
-#   // LLAMA_FTYPE_MOSTLY_Q4_2 = 5,        // support has been removed
-#   // LLAMA_FTYPE_MOSTLY_Q4_3 = 6,        // support has been removed
-#   LLAMA_FTYPE_MOSTLY_Q8_0 = 7,           // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q5_0 = 8,           // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q5_1 = 9,           // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q2_K = 10,          // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,        // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,        // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,        // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,        // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,        // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,        // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,        // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q6_K = 18,          // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19,       // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_IQ2_XS = 20,        // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_Q2_K_S = 21,        // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_IQ3_XS = 22,        // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23,       // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_IQ1_S = 24,         // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_IQ4_NL = 25,        // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_IQ3_S = 26,         // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_IQ3_M = 27,         // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_IQ2_S = 28,         // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_IQ2_M = 29,         // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_IQ4_XS = 30,        // except 1d tensors
-#   LLAMA_FTYPE_MOSTLY_IQ1_M = 31,         // except 1d tensors
-ftype=7
+# Must be one of
+#   Q4_0, Q4_1, Q5_0, Q5_1, IQ2_XXS, IQ2_XS, IQ2_S, IQ2_M,
+#   IQ1_S, IQ1_M, Q2_K, Q2_K_S, IQ3_XXS, IQ3_S, IQ3_M, Q3_K,
+#   IQ3_XS, Q3_K_S, Q3_K_M, Q3_K_L, IQ4_NL, IQ4_XS, Q4_K,
+#   Q4_K_S, Q4_K_M, Q5_K, Q5_K_S, Q5_K_M, Q6_K, Q8_0, F16
+ftype=Q6_K

-# this defines an override for tensors with names matching
-# a given string. filters are processed in order given, and the
-# first matching will be used.
+# Defines overrides for tensors with names matching a given
+# string. Filters are processed in order given, the first
+# matching will be used.
 #
 # Wildcards are allowed:
 #   ?   single character
 #   *   multiple characters
 #
+# Type must be one of
+#   F16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, Q8_1, Q2_K, Q3_K,
+#   Q4_K, Q5_K, Q6_K, Q8_K, IQ2_XXS, IQ2_XS, IQ3_XXS,
+#   IQ1_S, IQ4_NL, IQ3_S, IQ2_S, IQ4_XS, IQ1_M
-blk.10.ffn_up.weight=7
-blk.1?.ffn_up.weight=10
-blk.2?.ffn_up.weight=10
-blk.1?.attn*=23
-blk.2?.attn*=23
-*down*=14
-*gate*=12
-#   GGML_TYPE_F32 = 0,
-#   GGML_TYPE_F16 = 1,
-#   GGML_TYPE_Q4_0 = 2,
-#   GGML_TYPE_Q4_1 = 3,
-#   // GGML_TYPE_Q4_2 = 4,  support has been removed
-#   // GGML_TYPE_Q4_3 = 5,  support has been removed
-#   GGML_TYPE_Q5_0 = 6,
-#   GGML_TYPE_Q5_1 = 7,
-#   GGML_TYPE_Q8_0 = 8,
-#   GGML_TYPE_Q8_1 = 9,
-#   GGML_TYPE_Q2_K = 10,
-#   GGML_TYPE_Q3_K = 11,
-#   GGML_TYPE_Q4_K = 12,
-#   GGML_TYPE_Q5_K = 13,
-#   GGML_TYPE_Q6_K = 14,
-#   GGML_TYPE_Q8_K = 15,
-#   GGML_TYPE_IQ2_XXS = 16,
-#   GGML_TYPE_IQ2_XS = 17,
-#   GGML_TYPE_IQ3_XXS = 18,
-#   GGML_TYPE_IQ1_S = 19,
-#   GGML_TYPE_IQ4_NL = 20,
-#   GGML_TYPE_IQ3_S = 21,
-#   GGML_TYPE_IQ2_S = 22,
-#   GGML_TYPE_IQ4_XS = 23,
-#   GGML_TYPE_I8 = 24,
-#   GGML_TYPE_I16 = 25,
-#   GGML_TYPE_I32 = 26,
-#   GGML_TYPE_I64 = 27,
-#   GGML_TYPE_F64 = 28,
-#   GGML_TYPE_IQ1_M = 29,
+blk.10.ffn_up.weight=Q5_K
+blk.1?.ffn_up.weight=Q4_K
+blk.23.*=Q2_K
+blk.24.*=Q2_K
+blk.25.*=Q2_K
+blk.2?.ffn_up.weight=Q4_K
+*_gate*=Q4_K
+*.attn*=IQ4_XS
+*_down*=IQ3_S
+output.weight=Q5_K
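The override matching relies on the ? and * wildcards described in the comments above; the matcher itself is outside this diff. A minimal recursive sketch of those semantics (? = exactly one character, * = any run, possibly empty), for illustration only and not the code llama.cpp actually uses:

#include <cstdio>

static bool wildcard_match(const char * pat, const char * str) {
    if (*pat == '\0') {
        return *str == '\0';  // pattern used up: match only if the name is too
    }
    if (*pat == '*') {
        // '*' matches the empty run, or absorbs one more character of the name
        return wildcard_match(pat + 1, str) || (*str != '\0' && wildcard_match(pat, str + 1));
    }
    if (*str != '\0' && (*pat == '?' || *pat == *str)) {
        return wildcard_match(pat + 1, str + 1);  // literal or '?' match, advance both
    }
    return false;
}

int main() {
    // patterns taken from the overrides above; the first matching filter wins,
    // which is why blk.10.ffn_up.weight is listed before blk.1?.ffn_up.weight
    printf("%d\n", wildcard_match("blk.1?.ffn_up.weight", "blk.12.ffn_up.weight")); // 1
    printf("%d\n", wildcard_match("*_down*", "blk.7.ffn_down.weight"));             // 1
    printf("%d\n", wildcard_match("blk.23.*", "blk.3.attn_k.weight"));              // 0
    return 0;
}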