allow quantize to work for split and merged experts models in the same way
This commit is contained in:
parent
4531b029ee
commit
6886fdb887
1 changed files with 15 additions and 9 deletions
24
llama.cpp
24
llama.cpp
|
@ -6336,7 +6336,7 @@ struct llm_build_context {
|
||||||
cur_gate = ggml_silu(ctx0, cur_gate);
|
cur_gate = ggml_silu(ctx0, cur_gate);
|
||||||
cb(cur_gate, "ffn_moe_silu", il);
|
cb(cur_gate, "ffn_moe_silu", il);
|
||||||
|
|
||||||
cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
|
cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
|
||||||
cb(cur_expert, "ffn_moe_gate_par", il);
|
cb(cur_expert, "ffn_moe_gate_par", il);
|
||||||
|
|
||||||
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
||||||
|
@ -6871,7 +6871,7 @@ struct llm_build_context {
|
||||||
cur_gate = ggml_gelu(ctx0, cur_gate);
|
cur_gate = ggml_gelu(ctx0, cur_gate);
|
||||||
cb(cur_gate, "ffn_moe_gelu", il);
|
cb(cur_gate, "ffn_moe_gelu", il);
|
||||||
|
|
||||||
cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
|
cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
|
||||||
cb(cur_expert, "ffn_moe_gate_par", il);
|
cb(cur_expert, "ffn_moe_gate_par", il);
|
||||||
|
|
||||||
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
||||||
|
@ -12945,9 +12945,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
||||||
// sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
|
// sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
|
||||||
// for getting the current layer as I initially thought, and we need to resort to parsing the
|
// for getting the current layer as I initially thought, and we need to resort to parsing the
|
||||||
// tensor name.
|
// tensor name.
|
||||||
|
|
||||||
// hack
|
|
||||||
//n_layer /= n_expert;
|
|
||||||
if (sscanf(name, "blk.%d.", &i_layer) != 1) {
|
if (sscanf(name, "blk.%d.", &i_layer) != 1) {
|
||||||
throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
|
throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
|
||||||
}
|
}
|
||||||
|
@ -13371,10 +13368,19 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
qs.has_output = true;
|
qs.has_output = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t) qs.n_attention_wv != model.hparams.n_layer) {
|
// REVIEW: I do not understand why there is logic for counting the number of layers by counting the number of tensors
|
||||||
LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
|
// instead of just using the n_layer metadata
|
||||||
__func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
|
// without this change, it would require different logic for merged experts and split experts models,
|
||||||
}
|
// as split expert models end up with an ffn_* count that is n_expert times higher than the real number of layers,
|
||||||
|
// which then is corrected in layer_info by dividing the value by n_expert
|
||||||
|
// this code needs to be refactored
|
||||||
|
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
|
||||||
|
|
||||||
|
//if (qs.n_ffn_down )
|
||||||
|
//if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t) qs.n_attention_wv != model.hparams.n_layer) {
|
||||||
|
// LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
|
||||||
|
// __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
|
||||||
|
//}
|
||||||
|
|
||||||
size_t total_size_org = 0;
|
size_t total_size_org = 0;
|
||||||
size_t total_size_new = 0;
|
size_t total_size_new = 0;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue