Compatible with old and new MiniCPM versions

zhangkaihuo 2024-04-01 15:07:42 +08:00
parent e913ac9c38
commit 9ecc666b94
4 changed files with 19 additions and 2 deletions

convert-hf-to-gguf.py

@@ -1097,6 +1097,7 @@ class MiniCPMModel(Model):
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_tie_lm_head(self.hparams["tie_lm_head"])

     def set_vocab(self):
         self._set_vocab_hf()
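The converter side is a single new metadata call. As a hedged sketch (not part of this commit), a conversion script could derive that boolean from an upstream HF config.json before calling add_tie_lm_head(); the helper name and the tie_word_embeddings fallback are illustrative assumptions, since the hunk above reads hparams["tie_lm_head"] directly, and the default of True mirrors the llama.cpp hparams default added below:

import json

def read_tie_lm_head(config_path: str) -> bool:
    # Prefer an explicit "tie_lm_head" entry, fall back to the common HF
    # "tie_word_embeddings" flag, and default to True (tied) if neither is set.
    with open(config_path, "r", encoding="utf-8") as f:
        cfg = json.load(f)
    return bool(cfg.get("tie_lm_head", cfg.get("tie_word_embeddings", True)))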

gguf-py/gguf/constants.py

@@ -43,6 +43,7 @@ class Keys:
         EXPERT_USED_COUNT = "{arch}.expert_used_count"
         POOLING_TYPE = "{arch}.pooling_type"
         LOGIT_SCALE = "{arch}.logit_scale"
+        TIE_LM_HEAD = "{arch}.tie_lm_head"

     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"

@@ -805,6 +806,7 @@ KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT
 KEY_FEED_FORWARD_LENGTH = Keys.LLM.FEED_FORWARD_LENGTH
 KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL
 KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT
+KEY_TIE_LM_HEAD = Keys.LLM.TIE_LM_HEAD

 # attention
 KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT
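The new constant follows the same {arch}-templated pattern as its neighbours; a small sketch of how it expands for this architecture, assuming the gguf-py package from this tree is importable:

from gguf.constants import Keys

# GGUFWriter formats the architecture name into the key template:
assert Keys.LLM.TIE_LM_HEAD.format(arch="minicpm") == "minicpm.tie_lm_head"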

gguf-py/gguf/gguf_writer.py

@@ -342,6 +342,9 @@ class GGUFWriter:
     def add_parallel_residual(self, use: bool) -> None:
         self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

+    def add_tie_lm_head(self, tie_lm_head: bool) -> None:
+        self.add_bool(Keys.LLM.TIE_LM_HEAD.format(arch=self.arch), tie_lm_head)
+
     def add_head_count(self, count: int) -> None:
         self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)
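A minimal usage sketch for the new method, assuming the gguf-py package from this tree; the output path and the metadata-only file are illustrative, not something this commit produces:

from gguf import GGUFWriter

# Write a tiny GGUF file carrying only the new key ("minicpm.tie_lm_head").
writer = GGUFWriter("minicpm-meta-only.gguf", "minicpm")
writer.add_tie_lm_head(True)
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()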

llama.cpp

@@ -276,6 +276,7 @@ enum llm_kv {
     LLM_KV_EXPERT_USED_COUNT,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
+    LLM_KV_TIE_LM_HEAD,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,

@@ -320,6 +321,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_ADD_PREFIX,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
 };

 static const std::map<llm_kv, const char *> LLM_KV_NAMES = {

@@ -345,6 +347,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_USED_COUNT,       "%s.expert_used_count" },
     { LLM_KV_POOLING_TYPE,            "%s.pooling_type"      },
     { LLM_KV_LOGIT_SCALE,             "%s.logit_scale"       },
+    { LLM_KV_TIE_LM_HEAD,             "%s.tie_lm_head"       },

     { LLM_KV_ATTENTION_HEAD_COUNT,    "%s.attention.head_count"    },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },

@@ -1707,6 +1710,7 @@ struct llama_hparams {
     bool causal_attn = true;
     bool need_kq_pos = false;
+    bool tie_lm_head = true;

     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;

@@ -3503,6 +3507,7 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
     ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
     ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+    ml.get_key(LLM_KV_TIE_LM_HEAD, hparams.tie_lm_head, false);

     GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
     GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);

@@ -4375,7 +4380,9 @@ static bool llm_load_tensors(
         case LLM_ARCH_MINICPM:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                if (!hparams.tie_lm_head) {
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                }

                 // output
                 {

@@ -8700,7 +8707,11 @@ struct llm_build_context {
         cb(cur, "lmhead_scaling", -1);

         // lm_head
-        cur = ggml_mul_mat(ctx0, model.output, cur);
+        if (hparams.tie_lm_head) {
+            cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
+        } else {
+            cur = ggml_mul_mat(ctx0, model.output, cur);
+        }
         cb(cur, "result_output", -1);

         ggml_build_forward_expand(gf, cur);
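For intuition, the tied path above reuses the token-embedding matrix as the output projection instead of a separate output tensor. A rough numpy analogue of that projection (sizes are illustrative placeholders; ggml's tensor layout and the surrounding graph are not modeled here):

import numpy as np

n_vocab, n_embd = 32000, 2304                                  # illustrative sizes
tok_embd = np.random.rand(n_vocab, n_embd).astype(np.float32)  # token embedding table
hidden = np.random.rand(n_embd).astype(np.float32)             # final hidden state

# Tied lm_head: score every vocabulary row of the embedding table against the
# hidden state, which is what ggml_mul_mat(ctx0, model.tok_embd, cur) does in spirit.
logits = tok_embd @ hidden
assert logits.shape == (n_vocab,)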