parse ggml_type and llama_ftype, allow specifying cfg file
This commit is contained in:
parent 6e09a26504
commit 238551ed8c
2 changed files with 86 additions and 134 deletions
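With this change, the ftype argument accepts a CUSTOM[:filename] form. For reference, an invocation might look like this (binary and model names are illustrative):

./quantize model-f16.gguf model-out.gguf CUSTOM             # reads quant.cfg from the working directory
./quantize model-f16.gguf model-out.gguf CUSTOM:my-mix.cfg  # reads the named config file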
@@ -32,34 +32,55 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
     { "IQ3_S",   LLAMA_FTYPE_MOSTLY_IQ3_S,   " 3.44 bpw quantization", },
     { "IQ3_M",   LLAMA_FTYPE_MOSTLY_IQ3_M,   " 3.66 bpw quantization mix", },
     { "Q3_K",    LLAMA_FTYPE_MOSTLY_Q3_K_M,  "alias for Q3_K_M" },
     { "IQ3_XS",  LLAMA_FTYPE_MOSTLY_IQ3_XS,  " 3.3 bpw quantization", },
     { "Q3_K_S",  LLAMA_FTYPE_MOSTLY_Q3_K_S,  " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
     { "Q3_K_M",  LLAMA_FTYPE_MOSTLY_Q3_K_M,  " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
     { "Q3_K_L",  LLAMA_FTYPE_MOSTLY_Q3_K_L,  " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
     { "IQ4_NL",  LLAMA_FTYPE_MOSTLY_IQ4_NL,  " 4.50 bpw non-linear quantization", },
     { "IQ4_XS",  LLAMA_FTYPE_MOSTLY_IQ4_XS,  " 4.25 bpw non-linear quantization", },
     { "Q4_K",    LLAMA_FTYPE_MOSTLY_Q4_K_M,  "alias for Q4_K_M", },
     { "Q4_K_S",  LLAMA_FTYPE_MOSTLY_Q4_K_S,  " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
     { "Q4_K_M",  LLAMA_FTYPE_MOSTLY_Q4_K_M,  " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
     { "Q5_K",    LLAMA_FTYPE_MOSTLY_Q5_K_M,  "alias for Q5_K_M", },
     { "Q5_K_S",  LLAMA_FTYPE_MOSTLY_Q5_K_S,  " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
     { "Q5_K_M",  LLAMA_FTYPE_MOSTLY_Q5_K_M,  " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
     { "Q6_K",    LLAMA_FTYPE_MOSTLY_Q6_K,    " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
     { "Q8_0",    LLAMA_FTYPE_MOSTLY_Q8_0,    " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
     { "F16",     LLAMA_FTYPE_MOSTLY_F16,     "13.00G @ 7B", },
     { "F32",     LLAMA_FTYPE_ALL_F32,        "26.00G @ 7B", },
-    { "CUSTOM",  LLAMA_FTYPE_CUSTOM,         "per-layer scheme from file (quant.cfg)", },
+    { "CUSTOM",  LLAMA_FTYPE_CUSTOM,         "[:filename] Custom quant config (quant.cfg if not specified)", },
     // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
     { "COPY",    LLAMA_FTYPE_ALL_F32,        "only copy tensors, no quantizing", },
 };
 
-static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
+static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out, std::string & custom_cfg_filename_out) {
     std::string ftype_str;
 
     for (auto ch : ftype_str_in) {
         ftype_str.push_back(std::toupper(ch));
     }
 
+    if (ftype_str.find("CUSTOM:") == 0) {
+        // custom quant mix with an explicit config filename
+        ftype = LLAMA_FTYPE_CUSTOM;
+        ftype_str_out = "CUSTOM";
+        if (ftype_str.length() > 7) {
+            // extract the config filename (taken from ftype_str_in to keep the original casing)
+            custom_cfg_filename_out = ftype_str_in.substr(7);
+        } else {
+            return false;
+        }
+        return true;
+    } else if (ftype_str.find("CUSTOM") == 0) {
+        // custom quant mix with the default config file
+        ftype = LLAMA_FTYPE_CUSTOM;
+        ftype_str_out = "CUSTOM";
+        custom_cfg_filename_out = "quant.cfg";
+        return true;
+    }
+
     for (auto & it : QUANT_OPTIONS) {
         if (it.name == ftype_str) {
             ftype = it.ftype;
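Extracted from the hunk above, a minimal self-contained sketch of the CUSTOM[:filename] prefix handling (parse_custom_spec is a hypothetical name; the real function also falls through to the QUANT_OPTIONS table):

#include <cctype>
#include <cstdio>
#include <string>

// returns true if spec selects the custom scheme; cfg_out receives the
// config filename ("quant.cfg" when no ":filename" suffix is given)
static bool parse_custom_spec(const std::string & spec, std::string & cfg_out) {
    std::string upper;
    for (unsigned char ch : spec) upper.push_back(std::toupper(ch));
    if (upper.rfind("CUSTOM:", 0) == 0) {
        if (spec.length() <= 7) return false; // bare "CUSTOM:" with no filename is an error
        cfg_out = spec.substr(7);             // take from spec to keep the original casing
        return true;
    }
    if (upper.rfind("CUSTOM", 0) == 0) {
        cfg_out = "quant.cfg";                // default config file
        return true;
    }
    return false;
}

int main() {
    std::string cfg;
    if (parse_custom_spec("custom:My.CFG", cfg)) printf("%s\n", cfg.c_str()); // My.CFG
    if (parse_custom_spec("CUSTOM", cfg))        printf("%s\n", cfg.c_str()); // quant.cfg
    return 0;
}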
@@ -203,7 +224,7 @@ static ggml_type parse_ggml_type(const char * arg) {
     for (int j = 0; j < GGML_TYPE_COUNT; ++j) {
         auto type = ggml_type(j);
         const auto * name = ggml_type_name(type);
-        if (name && strcmp(arg, name) == 0) {
+        if (name && strcasecmp(arg, name) == 0) {
             result = type; break;
         }
     }
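The switch from strcmp to strcasecmp makes ggml type names case-insensitive, so q5_k resolves the same as Q5_K. strcasecmp is POSIX (declared in <strings.h>); on Windows the usual substitute is _stricmp. A quick check:

#include <cstdio>
#include <strings.h> // strcasecmp (POSIX)

int main() {
    // strcasecmp returns 0 on a case-insensitive match
    printf("%d %d\n", strcasecmp("q5_k", "Q5_K") == 0, strcasecmp("q5_k", "Q6_K") == 0); // 1 0
    return 0;
}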
@@ -253,7 +274,7 @@ static bool read_custom_quant_config(const std::string& filename, llama_model_quantize_ftype_override& override) {
     std::vector<std::string> names;
     std::vector<ggml_type> types;
 
-    printf("%s: reading custom quantization scheme from %s:\n", __func__, filename.c_str());
+    printf("reading custom quantization mix from %s:\n", filename.c_str());
 
     if (!file.is_open()) {
         fprintf(stderr, "%s: failed to open file: '%s'\n", __func__, filename.c_str());
@@ -261,25 +282,41 @@ static bool read_custom_quant_config(const std::string& filename, llama_model_quantize_ftype_override& override) {
     }
 
     while (getline(file, line)) {
-        // Skip empty lines and comments
+        // skip empty lines and comments
         if (line.empty() || line[0] == '#') continue;
 
         // default file type
         if (line.find("ftype=") == 0) {
-            int ftype = std::stoi(line.substr(6));
+            std::string ftype_str = line.substr(6);
+            std::string ftype_name;
+            std::string custom_quant_config_filename;
+            llama_ftype ftype;
+            if (!try_parse_ftype(ftype_str, ftype, ftype_name, custom_quant_config_filename)) {
+                fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, ftype_str.c_str());
+                file.close();
+                return false;
+            }
 
             override.default_ftype = static_cast<llama_ftype>(ftype);
-            printf("  default ftype = %i\n", ftype);
+            printf("  default ftype = %i (%s)\n", ftype, ftype_name.c_str());
             continue;
         }
 
         // tensor overrides
         size_t pos = line.find('=');
        if (pos != std::string::npos) {
-            std::string name = line.substr(0, pos);
-            int type = std::stoi(line.substr(pos + 1));
-            names.push_back(name);
+            std::string tensor_name = line.substr(0, pos);
+            std::string type_name = line.substr(pos + 1);
+            ggml_type type = parse_ggml_type(type_name.c_str());
+            if (type < 0 || type >= GGML_TYPE_COUNT) {
+                fprintf(stderr, "%s: invalid ggml_type '%s'\n", __func__, type_name.c_str());
+                file.close();
+                return false;
+            }
+            names.push_back(tensor_name);
             types.push_back(static_cast<ggml_type>(type));
-            printf("  %s = %i\n", name.c_str(), type);
+            printf("  %s = %i (%s)\n", tensor_name.c_str(), type, type_name.c_str());
         }
     }
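A compilable sketch of just the line-splitting logic above, with the llama/ggml lookups stubbed out (the file name and layout follow quant.cfg; the parse helpers are omitted):

#include <cstdio>
#include <fstream>
#include <string>
#include <utility>
#include <vector>

int main() {
    std::ifstream file("quant.cfg");
    std::string line;
    std::string default_ftype;                                  // from the "ftype=" line
    std::vector<std::pair<std::string, std::string>> overrides; // tensor pattern -> type name

    while (std::getline(file, line)) {
        if (line.empty() || line[0] == '#') continue;  // skip blanks and comments
        if (line.rfind("ftype=", 0) == 0) {            // default scheme, e.g. "ftype=Q6_K"
            default_ftype = line.substr(6);
            continue;
        }
        const size_t pos = line.find('=');             // "pattern=TYPE" override
        if (pos != std::string::npos) {
            overrides.emplace_back(line.substr(0, pos), line.substr(pos + 1));
        }
    }
    printf("default ftype: %s, %zu overrides\n", default_ftype.c_str(), overrides.size());
    return 0;
}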
@@ -383,9 +420,10 @@ int main(int argc, char ** argv) {
     const std::string fname_inp = argv[arg_idx];
     arg_idx++;
     std::string fname_out;
+    std::string custom_quant_config_filename;
 
     std::string ftype_str;
-    if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
+    if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str, custom_quant_config_filename)) {
         std::string fpath;
         const size_t pos = fname_inp.find_last_of("/\\");
         if (pos != std::string::npos) {
@@ -406,7 +444,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
+    if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str, custom_quant_config_filename)) {
         fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
         return 1;
     }
@@ -417,8 +455,7 @@ int main(int argc, char ** argv) {
 
     if (ftype_str == "CUSTOM") {
         params.override_ftype = new llama_model_quantize_ftype_override;
-        if (!read_custom_quant_config("quant.cfg", *params.override_ftype)) {
-            fprintf(stderr, "%s: failed to read custom quant config file!\n", __func__);
+        if (!read_custom_quant_config(custom_quant_config_filename, *params.override_ftype)) {
            return 1;
        }
    }
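The quant.cfg section below documents ? and * wildcards for the tensor-name overrides; the code that applies them lives elsewhere in llama.cpp and is not part of this diff. A minimal glob matcher with those semantics might look like the following (match_pattern is a hypothetical name, not the PR's actual helper):

#include <cstdio>

// glob-style match: '?' matches exactly one character, '*' matches any run
// (including an empty one); everything else must match literally
static bool match_pattern(const char * pat, const char * str) {
    if (*pat == '\0') return *str == '\0';
    if (*pat == '*') {
        // '*' either consumes nothing, or one more character of str
        return match_pattern(pat + 1, str) || (*str != '\0' && match_pattern(pat, str + 1));
    }
    if (*str == '\0') return false;
    if (*pat != '?' && *pat != *str) return false;
    return match_pattern(pat + 1, str + 1);
}

int main() {
    printf("%d\n", match_pattern("blk.1?.ffn_up.weight", "blk.13.ffn_up.weight")); // 1
    printf("%d\n", match_pattern("*.attn*", "blk.20.attn_k.weight"));              // 1
    printf("%d\n", match_pattern("*_down*", "blk.5.ffn_up.weight"));               // 0
    return 0;
}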
quant.cfg (141 lines changed)
@@ -1,121 +1,36 @@
-# this defines the default ftype (the quantization mix code,
+# Defines the default ftype (the quantization mix code,
 # that you pass to quantize if you're not using custom mix).
 # tensors that are not overriden below will be quantized
-# according to this scheme.
+# according to this mix.
+#
+# Must be one of
+# Q4_0, Q4_1, Q5_0, Q5_1, IQ2_XXS, IQ2_XS, IQ2_S, IQ2_M,
+# IQ1_S, IQ1_M, Q2_K, Q2_K_S, IQ3_XXS, IQ3_S, IQ3_M, Q3_K,
+# IQ3_XS, Q3_K_S, Q3_K_M, Q3_K_L, IQ4_NL, IQ4_XS, Q4_K,
+# Q4_K_S, Q4_K_M, Q5_K, Q5_K_S, Q5_K_M, Q6_K, Q8_0, F16
 
-ftype=7
-
-# allowed values:
-# LLAMA_FTYPE_ALL_F32 = 0,
-# LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-# // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
-# // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
-# LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_IQ3_XS = 22, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_IQ3_S = 26, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_IQ3_M = 27, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
-# LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
+ftype=Q6_K
 
-# this defines an override for tensors with names matching
-# a given string. filters are processed in order given, and the
-# first matching will be used.
+# Defines overrides for tensors with names matching a given
+# string. Filters are processed in order given, the first
+# matching will be used.
 #
 # Wildcards are allowed:
 # ? single character
 # * multiple characters
 #
+# Type must be one of
+# F16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, Q8_1, Q2_K, Q3_K,
+# Q4_K, Q5_K, Q6_K, Q8_K, IQ2_XXS, IQ2_XS, IQ3_XXS,
+# IQ1_S, IQ4_NL, IQ3_S, IQ2_S, IQ4_XS, IQ1_M
 
-blk.10.ffn_up.weight=7
-blk.1?.ffn_up.weight=10
-blk.2?.ffn_up.weight=10
-blk.1?.attn*=23
-blk.2?.attn*=23
-*down*=14
-*gate*=12
-
-# GGML_TYPE_F32 = 0,
-# GGML_TYPE_F16 = 1,
-# GGML_TYPE_Q4_0 = 2,
-# GGML_TYPE_Q4_1 = 3,
-# // GGML_TYPE_Q4_2 = 4, support has been removed
-# // GGML_TYPE_Q4_3 = 5, support has been removed
-# GGML_TYPE_Q5_0 = 6,
-# GGML_TYPE_Q5_1 = 7,
-# GGML_TYPE_Q8_0 = 8,
-# GGML_TYPE_Q8_1 = 9,
-# GGML_TYPE_Q2_K = 10,
-# GGML_TYPE_Q3_K = 11,
-# GGML_TYPE_Q4_K = 12,
-# GGML_TYPE_Q5_K = 13,
-# GGML_TYPE_Q6_K = 14,
-# GGML_TYPE_Q8_K = 15,
-# GGML_TYPE_IQ2_XXS = 16,
-# GGML_TYPE_IQ2_XS = 17,
-# GGML_TYPE_IQ3_XXS = 18,
-# GGML_TYPE_IQ1_S = 19,
-# GGML_TYPE_IQ4_NL = 20,
-# GGML_TYPE_IQ3_S = 21,
-# GGML_TYPE_IQ2_S = 22,
-# GGML_TYPE_IQ4_XS = 23,
-# GGML_TYPE_I8 = 24,
-# GGML_TYPE_I16 = 25,
-# GGML_TYPE_I32 = 26,
-# GGML_TYPE_I64 = 27,
-# GGML_TYPE_F64 = 28,
-# GGML_TYPE_IQ1_M = 29,
+blk.10.ffn_up.weight=Q5_K
+blk.1?.ffn_up.weight=Q4_K
+blk.23.*=Q2_K
+blk.24.*=Q2_K
+blk.25.*=Q2_K
+blk.2?.ffn_up.weight=Q4_K
+*_gate*=Q4_K
+*.attn*=IQ4_XS
+*_down*=IQ3_S
+output.weight=Q5_K
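As a worked example of the first-match-wins rule in the new config: blk.10.ffn_up.weight hits the literal first entry and is quantized as Q5_K, while blk.13.ffn_up.weight falls through to blk.1?.ffn_up.weight and gets Q4_K; blk.23.attn_k.weight is caught by blk.23.*=Q2_K before *.attn* can apply, whereas attention tensors in other blocks get IQ4_XS; any tensor matching no pattern uses the default ftype=Q6_K.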