Adding quantization mixes

parent 5c5191ab68
commit d537b97cb8

3 changed files with 76 additions and 20 deletions
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -7,15 +7,22 @@
 #include <string>
 
 static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
     {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
     {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
     {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
     {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
     {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
-    {"q3_K", LLAMA_FTYPE_MOSTLY_Q3_K},
-    {"q4_K", LLAMA_FTYPE_MOSTLY_Q4_K},
-    {"q5_K", LLAMA_FTYPE_MOSTLY_Q5_K},
-    {"q6_K", LLAMA_FTYPE_MOSTLY_Q6_K},
+    {"q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M},
+    {"q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S},
+    {"q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M},
+    {"q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L},
+    {"q4_K",   LLAMA_FTYPE_MOSTLY_Q4_K_M},
+    {"q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S},
+    {"q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M},
+    {"q5_K",   LLAMA_FTYPE_MOSTLY_Q5_K_M},
+    {"q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S},
+    {"q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M},
+    {"q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K},
 };
 
 bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::string & ftype_str_out) {
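The bare q3_K/q4_K/q5_K names are kept as aliases for the Medium mixes, so existing command lines keep working. A minimal sketch of how a name resolves, assuming it sits inside quantize.cpp where LLAMA_FTYPE_MAP and try_parse_ftype are in scope (alias_example is a hypothetical helper, not part of the commit):

```cpp
#include <cassert>

// Hypothetical check: the plain "q4_K" name now resolves to the Medium mix.
static void alias_example() {
    llama_ftype ftype;
    std::string ftype_str_out;
    if (try_parse_ftype("q4_K", ftype, ftype_str_out)) {
        assert(ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M); // looked up via LLAMA_FTYPE_MAP
    }
}
```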
--- a/llama.cpp
+++ b/llama.cpp
@@ -905,9 +905,14 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
-        case LLAMA_FTYPE_MOSTLY_Q3_K: return "mostly Q3_K";
-        case LLAMA_FTYPE_MOSTLY_Q4_K: return "mostly Q4_K";
-        case LLAMA_FTYPE_MOSTLY_Q5_K: return "mostly Q5_K";
+        // K-quants
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
         default: return "unknown, may not work";
     }
@@ -2074,12 +2079,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
-        case LLAMA_FTYPE_MOSTLY_Q3_K: quantized_type = GGML_TYPE_Q3_K; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_K: quantized_type = GGML_TYPE_Q4_K; break;
-        case LLAMA_FTYPE_MOSTLY_Q5_K: quantized_type = GGML_TYPE_Q5_K; break;
+        // K-quants
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M:
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
         case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
         default: throw format("invalid output file type %d\n", ftype);
-    };
+    }
 
     if (nthread <= 0) {
         nthread = std::thread::hardware_concurrency();
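Note the deliberate case fallthroughs above: every mix in a family shares one base GGML type here, and the Small/Medium/Large variants differ only in the per-tensor overrides applied further down in this function. A reduced sketch of that mapping (k_quant_base_type is a hypothetical free function, assuming the llama.h and ggml.h enums are in scope):

```cpp
// Hypothetical restatement of the switch above: each K-quant mix collapses to a
// shared base type; what actually varies per mix is decided tensor by tensor later.
static ggml_type k_quant_base_type(enum llama_ftype ftype) {
    switch (ftype) {
        case LLAMA_FTYPE_MOSTLY_Q3_K_S:
        case LLAMA_FTYPE_MOSTLY_Q3_K_M:
        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return GGML_TYPE_Q3_K;
        case LLAMA_FTYPE_MOSTLY_Q4_K_S:
        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return GGML_TYPE_Q4_K;
        case LLAMA_FTYPE_MOSTLY_Q5_K_S:
        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return GGML_TYPE_Q5_K;
        case LLAMA_FTYPE_MOSTLY_Q6_K:   return GGML_TYPE_Q6_K; // no S/M/L variants
        default:                        return GGML_TYPE_F16;  // non-K types omitted in this sketch
    }
}
```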
@@ -2089,6 +2099,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
 
+    int n_attention_wv = 0;
+    int n_feed_forward_w2 = 0;
+    for (auto& tensor : model_loader->tensors_map.tensors) {
+        if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+            ++n_attention_wv;
+        }
+        else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+            ++n_feed_forward_w2;
+        }
+    }
+
+    int i_attention_wv = 0;
+    int i_feed_forward_w2 = 0;
+
     size_t total_size_org = 0;
     size_t total_size_new = 0;
     std::vector<int64_t> hist_all(1 << 4, 0);
@@ -2132,6 +2156,27 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             new_type = quantized_type;
             if (tensor.name == "output.weight") new_type = GGML_TYPE_Q6_K;
+            else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
+                        (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+                ++i_attention_wv;
+            }
+            else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                        (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
+                        (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+                ++i_feed_forward_w2;
+            }
+            else if (tensor.name.find("feed_forward.w3.weight") != std::string::npos ||
+                     tensor.name.find("attention.wo.weight") != std::string::npos) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+            }
             float * f32_data;
             size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
             llama_buffer f32_conv_buf;
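The position test in the attention.wv / feed_forward.w2 branches is what makes the Medium mixes "mixes": under Q4_K_M and Q5_K_M, the first eighth of layers, the last eighth, and every third tensor in between are promoted to Q6_K (output.weight is always promoted, per the first override). A self-contained worked example of which layers that selects, assuming a hypothetical 32-layer model, where the counting pass added earlier yields n_attention_wv == 32 (one attention.wv.weight per layer):

```cpp
#include <cstdio>

// Standalone rerun of the promotion rule above for an assumed 32-layer model.
int main() {
    const int n = 32; // n_attention_wv: one attention.wv.weight per layer
    for (int i = 0; i < n; ++i) {
        // same condition as in llama_model_quantize_internal
        if (i < n/8 || i >= 7*n/8 || (i - n/8) % 3 == 2) {
            printf("layer %2d -> GGML_TYPE_Q6_K\n", i);
        }
    }
    // selects layers 0-3, 6, 9, 12, 15, 18, 21, 24, 27, and 28-31: 16 of 32
    return 0;
}
```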
--- a/llama.h
+++ b/llama.h
@@ -94,10 +94,14 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K = 10, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_K = 11, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_K = 12, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q6_K = 13, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_S = 10, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_M = 11, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_L = 12, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_S = 13, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_M = 14, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_S = 15, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_M = 16, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K = 17,   // except 1d tensors
     };
 
     LLAMA_API struct llama_context_params llama_context_default_params();
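One consequence worth flagging: the K-quant ids are renumbered rather than appended, so the old LLAMA_FTYPE_MOSTLY_Q3_K slot (10) now means Q3_K_S and Q6_K moves from 13 to 17; files quantized against the previous header will be reported as a different type under this one. A compact restatement of the new layout (hypothetical compile-time checks, not part of the commit):

```cpp
#include "llama.h"

// Hypothetical sanity checks summarizing the renumbering above.
static_assert(LLAMA_FTYPE_MOSTLY_Q3_K_S == 10, "reuses the old Q3_K id");
static_assert(LLAMA_FTYPE_MOSTLY_Q4_K_M == 14, "new id");
static_assert(LLAMA_FTYPE_MOSTLY_Q6_K == 17, "moved from 13");
```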