From 9ecc666b94ca377ea4c4ce7706ca87239eab3d77 Mon Sep 17 00:00:00 2001
From: zhangkaihuo
Date: Mon, 1 Apr 2024 15:07:42 +0800
Subject: [PATCH] compatible with old and new MiniCPM versions

MiniCPM checkpoints differ in whether lm_head is tied to the token
embedding. Record the choice in GGUF as "{arch}.tie_lm_head"
(defaulting to tied), create the output tensor only when the head is
untied, and reuse tok_embd for the lm_head projection otherwise.

---
 convert-hf-to-gguf.py       |  1 +
 gguf-py/gguf/constants.py   |  2 ++
 gguf-py/gguf/gguf_writer.py |  3 +++
 llama.cpp                   | 15 +++++++++++++--
 4 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 78a2b1c67..d00481d02 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1097,6 +1097,7 @@ class MiniCPMModel(Model):
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_tie_lm_head(self.hparams.get("tie_lm_head", True))
 
     def set_vocab(self):
         self._set_vocab_hf()
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index ee26224d4..9007c8add 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -43,6 +43,7 @@ class Keys:
         EXPERT_USED_COUNT        = "{arch}.expert_used_count"
         POOLING_TYPE             = "{arch}.pooling_type"
         LOGIT_SCALE              = "{arch}.logit_scale"
+        TIE_LM_HEAD              = "{arch}.tie_lm_head"
 
     class Attention:
         HEAD_COUNT        = "{arch}.attention.head_count"
@@ -805,6 +806,7 @@ KEY_BLOCK_COUNT          = Keys.LLM.BLOCK_COUNT
 KEY_FEED_FORWARD_LENGTH   = Keys.LLM.FEED_FORWARD_LENGTH
 KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL
 KEY_TENSOR_DATA_LAYOUT    = Keys.LLM.TENSOR_DATA_LAYOUT
+KEY_TIE_LM_HEAD           = Keys.LLM.TIE_LM_HEAD
 
 # attention
 KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 2ae6c814b..10465ab5e 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -342,6 +342,9 @@ class GGUFWriter:
 
     def add_parallel_residual(self, use: bool) -> None:
         self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
+
+    def add_tie_lm_head(self, tie_lm_head: bool) -> None:
+        self.add_bool(Keys.LLM.TIE_LM_HEAD.format(arch=self.arch), tie_lm_head)
 
     def add_head_count(self, count: int) -> None:
         self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)
diff --git a/llama.cpp b/llama.cpp
index 8363233fa..7337fc02c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -276,6 +276,7 @@ enum llm_kv {
     LLM_KV_EXPERT_USED_COUNT,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
+    LLM_KV_TIE_LM_HEAD,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -320,6 +321,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_ADD_PREFIX,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
+
 };
 
 static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -345,6 +347,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_USED_COUNT,             "%s.expert_used_count"       },
     { LLM_KV_POOLING_TYPE ,                 "%s.pooling_type"            },
     { LLM_KV_LOGIT_SCALE,                   "%s.logit_scale"             },
+    { LLM_KV_TIE_LM_HEAD,                   "%s.tie_lm_head"             },
 
     { LLM_KV_ATTENTION_HEAD_COUNT,          "%s.attention.head_count"    },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV,       "%s.attention.head_count_kv" },
@@ -1707,6 +1710,7 @@ struct llama_hparams {
 
     bool causal_attn = true;
     bool need_kq_pos = false;
+    bool tie_lm_head = true;
 
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type    rope_type    = LLAMA_ROPE_TYPE_NONE;
@@ -3503,6 +3507,7 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_BLOCK_COUNT,       hparams.n_layer);
     ml.get_key(LLM_KV_EXPERT_COUNT,      hparams.n_expert,      false);
     ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+    ml.get_key(LLM_KV_TIE_LM_HEAD,       hparams.tie_lm_head,   false);
 
     GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
     GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
@@ -4375,7 +4380,9 @@ static bool llm_load_tensors(
             case LLM_ARCH_MINICPM:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    if (!hparams.tie_lm_head) {
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    }
 
                     // output
                     {
@@ -8700,7 +8707,11 @@ struct llm_build_context {
         cb(cur, "lmhead_scaling", -1);
 
         // lm_head
-        cur = ggml_mul_mat(ctx0, model.output, cur);
+        if (hparams.tie_lm_head) {
+            cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
+        } else {
+            cur = ggml_mul_mat(ctx0, model.output, cur);
+        }
         cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);
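
Notes (reviewer commentary, not part of the commit):

Older MiniCPM configs may not carry a "tie_lm_head" entry at all, which is
why the C++ side reads the key as optional (ml.get_key(..., false)) and
defaults to tied, and why the converter reads it with a fallback. A rough
Python sketch of that fallback logic; detect_tie_lm_head, config, and
weight_names are illustrative placeholders, and the guess that a checkpoint
shipping no separate lm_head.weight must be tied is an assumption of this
sketch, not something the patch itself checks:

    # Sketch only: mirrors this patch's defaults, not code taken from it.
    def detect_tie_lm_head(config: dict, weight_names: set) -> bool:
        # Newer configs state the choice explicitly.
        if "tie_lm_head" in config:
            return bool(config["tie_lm_head"])
        # Assumed fallback: no separate lm_head tensor means the output
        # projection reuses the token embedding (matches tie_lm_head = true).
        return "lm_head.weight" not in weight_names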
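The llama.cpp change reduces to one linear-algebra fact: with a tied head,
logits come from projecting the final hidden state against the same matrix
that embeds tokens. A NumPy sketch of the two lm_head branches above; the
(n_vocab, n_embd) layout and the copy-based comparison are illustrative
assumptions, not patch behavior:

    import numpy as np

    rng = np.random.default_rng(0)
    n_vocab, n_embd = 8, 4

    tok_embd = rng.normal(size=(n_vocab, n_embd)).astype(np.float32)  # embedding matrix
    hidden   = rng.normal(size=(n_embd,)).astype(np.float32)          # final hidden state

    # tie_lm_head == true: cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
    logits_tied = tok_embd @ hidden

    # tie_lm_head == false: cur = ggml_mul_mat(ctx0, model.output, cur);
    output = tok_embd.copy()            # stand-in for a separately stored head
    logits_untied = output @ hidden

    assert np.allclose(logits_tied, logits_untied)  # equal only because of the copy
    print(logits_tied.shape)            # (8,) -- one logit per vocabulary token

Skipping the output tensor when the head is tied is what lets one loader
serve both checkpoint generations without storing a duplicate
(n_vocab, n_embd) matrix in the GGUF file.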