diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 1ee42da4d..1ad112f7c 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -7,15 +7,22 @@
 #include <string>
 
 static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
-    {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
-    {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
-    {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
-    {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
-    {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
-    {"q3_K", LLAMA_FTYPE_MOSTLY_Q3_K},
-    {"q4_K", LLAMA_FTYPE_MOSTLY_Q4_K},
-    {"q5_K", LLAMA_FTYPE_MOSTLY_Q5_K},
-    {"q6_K", LLAMA_FTYPE_MOSTLY_Q6_K},
+    {"q4_0",   LLAMA_FTYPE_MOSTLY_Q4_0},
+    {"q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1},
+    {"q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0},
+    {"q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1},
+    {"q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0},
+    {"q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M},
+    {"q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S},
+    {"q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M},
+    {"q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L},
+    {"q4_K",   LLAMA_FTYPE_MOSTLY_Q4_K_M},
+    {"q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S},
+    {"q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M},
+    {"q5_K",   LLAMA_FTYPE_MOSTLY_Q5_K_M},
+    {"q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S},
+    {"q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M},
+    {"q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K},
 };
 
 bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::string & ftype_str_out) {
diff --git a/llama.cpp b/llama.cpp
index ba0f4ad7d..34ca296ea 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -905,9 +905,14 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
-        case LLAMA_FTYPE_MOSTLY_Q3_K: return "mostly Q3_K";
-        case LLAMA_FTYPE_MOSTLY_Q4_K: return "mostly Q4_K";
-        case LLAMA_FTYPE_MOSTLY_Q5_K: return "mostly Q5_K";
+        // K-quants
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
         default: return "unknown, may not work";
     }
@@ -2074,12 +2079,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
-        case LLAMA_FTYPE_MOSTLY_Q3_K: quantized_type = GGML_TYPE_Q3_K; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_K: quantized_type = GGML_TYPE_Q4_K; break;
-        case LLAMA_FTYPE_MOSTLY_Q5_K: quantized_type = GGML_TYPE_Q5_K; break;
+        // K-quants
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M:
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S:
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
         case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
         default: throw format("invalid output file type %d\n", ftype);
-    };
+    }
 
     if (nthread <= 0) {
         nthread = std::thread::hardware_concurrency();
@@ -2089,6 +2099,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                                                                             /*vocab_only*/ false));
     llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
 
+    int n_attention_wv    = 0;
+    int n_feed_forward_w2 = 0;
+    for (auto& tensor : model_loader->tensors_map.tensors) {
+        if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+            ++n_attention_wv;
+        }
+        else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+            ++n_feed_forward_w2;
+        }
+    }
+
+    int i_attention_wv = 0;
+    int i_feed_forward_w2 = 0;
+
     size_t total_size_org = 0;
     size_t total_size_new = 0;
     std::vector<int64_t> hist_all(1 << 4, 0);
@@ -2132,6 +2156,27 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             new_type = quantized_type;
             if (tensor.name == "output.weight") new_type = GGML_TYPE_Q6_K;
+            else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
+                        (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+                ++i_attention_wv;
+            }
+            else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
+                        (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
+                        (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
+                ++i_feed_forward_w2;
+            }
+            else if (tensor.name.find("feed_forward.w3.weight") != std::string::npos ||
+                     tensor.name.find("attention.wo.weight") != std::string::npos) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+            }
             float * f32_data;
             size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
             llama_buffer f32_conv_buf;
diff --git a/llama.h b/llama.h
index ffd22ae41..1ffc30963 100644
--- a/llama.h
+++ b/llama.h
@@ -94,10 +94,14 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K = 10, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_K = 11, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_K = 12, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q6_K = 13, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_S = 10, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_M = 11, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_L = 12, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_S = 13, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_M = 14, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_S = 15, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_M = 16, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K = 17, // except 1d tensors
     };
 
     LLAMA_API struct llama_context_params llama_context_default_params();
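With the new LLAMA_FTYPE_MAP entries, the extra type names can be passed straight to the quantize example. A hypothetical invocation (model paths are illustrative), assuming the tool's usual positional form of input model, output model, type name:

./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_K_M.bin q4_K_M

Note that plain "q3_K", "q4_K", and "q5_K" remain accepted and resolve to the corresponding _M variant, so existing scripts keep working.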
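For readers tracing the Q4_K_M / Q5_K_M branches above: for the attention.wv and feed_forward.w2 tensors, the patch promotes the first eighth of the layers, the last eighth, and every third layer in between to Q6_K, keeping the base K-quant type elsewhere. The standalone sketch below is not part of the patch; it merely replays that index rule, under the assumption of a 32-layer model (e.g. 7B LLaMA), to show which layers get promoted:

// Sketch only: reproduces the layer-selection rule from the patch above.
// Assumption: n_attention_wv = 32, i.e. one attention.wv tensor per layer of a
// hypothetical 32-layer model; the real code counts tensors at load time.
#include <cstdio>

int main() {
    const int n_attention_wv = 32;
    for (int i = 0; i < n_attention_wv; ++i) {
        // Same condition as the Q4_K_M / Q5_K_M branch in llama.cpp above.
        const bool use_q6_k = i < n_attention_wv/8 || i >= 7*n_attention_wv/8 ||
                              (i - n_attention_wv/8)%3 == 2;
        printf("attention.wv layer %2d -> %s\n", i, use_q6_k ? "Q6_K" : "Q4_K");
    }
    return 0;
}

For 32 layers this selects layers 0-3, 28-31, and every third layer starting at layer 6, so half of the attention.wv tensors end up one step above the base type.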