Quantization imrovements for k_quants (#2707)
* Improve LLaMA-2 2-, 3- and 4-bit quantization * Q3_K_S: use Q5_K for 1st 2 layers of attention.wv and feed_forward.w2 * Q4_K_S: use Q6_K for 1st 2 layers of attention.wv and feed_forward.w2 * Q2_K and Q3_K_M: use Q5_K instead of Q4_K for 1st 2 layers of attention.wv and feed_forward.w2 This leads to a slight model sized increase as follows: Q2_K : 2.684G vs 2.670G Q3_K_S: 2.775G vs 2.745G Q3_K_M: 3.071G vs 3.057G Q4_K_S: 3.592G vs 3.563G LLaMA-2 PPL for context 512 changes as follows: Q2_K : 6.6691 vs 6.8201 Q3_K_S: 6.2129 vs 6.2584 Q3_K_M: 6.0387 vs 6.1371 Q4_K_S: 5.9138 vs 6.0041 There are improvements for LLaMA-1 as well, but they are way smaller than the above. * Minor 4-bit quantization improvement For the same model size as previus commit, we get PPL = 5.9069 vs 5.9138. * Some more fine tuning * Adding make_qkx2_quants With it, we get PPL = 5.8828 for L2-7B Q4_K_S. * Another minor improvement * Q2_K improvement Smaller model, lower perplexity. 7B: file size = 2.632G, PPL = 6.3772 vs original 2.670G PPL = 6.8201 12B: file size = 5.056G, PPL = 5.4577 vs original 5.130G PPL = 5.7178 It is mostly Q3_K except for tok_embeddings, attention.wq, attention.wk, which are Q2_K * Iterating * Revert Q5_K back to make_qkx1_quants * Better Q6_K * make_qkx2_quants is better for Q5_K after all * Fix after rebasing on master * Fix for changed tensor names --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
parent
519c981f8b
commit
bac66994cf
2 changed files with 130 additions and 58 deletions
24
llama.cpp
24
llama.cpp
|
@ -3547,24 +3547,40 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
new_type = GGML_TYPE_Q6_K;
|
||||
}
|
||||
} else if (name.find("attn_v.weight") != std::string::npos) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
||||
new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
||||
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
||||
use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
|
||||
else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
|
||||
(i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
|
||||
++i_attention_wv;
|
||||
} else if (name.find("ffn_down.weight") != std::string::npos) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
||||
new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
||||
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
||||
use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
|
||||
//else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K;
|
||||
++i_feed_forward_w2;
|
||||
} else if (name.find("attn_output.weight") != std::string::npos) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
||||
}
|
||||
else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
||||
}
|
||||
// This can be used to reduce the size of the Q5_K_S model.
|
||||
// The associated PPL increase is fully in line with the size reduction
|
||||
//else {
|
||||
// if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
|
||||
//}
|
||||
bool convert_incompatible_tensor = false;
|
||||
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
||||
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue