diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 278edccdc..4fd916cba 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1638,8 +1638,6 @@ class MiniCPMModel(Model):
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_file_type(self.ftype)
-        if "tie_word_embeddings" in self.hparams:
-            self.gguf_writer.add_tie_lm_head(self.hparams["tie_word_embeddings"])
 
     def set_vocab(self):
         self._set_vocab_llama_hf()
diff --git a/examples/minicpmv/minicpm-surgery.py b/examples/minicpmv/minicpm-surgery.py
index 85b498c97..5916744a0 100644
--- a/examples/minicpmv/minicpm-surgery.py
+++ b/examples/minicpmv/minicpm-surgery.py
@@ -47,7 +47,8 @@ with open(f"{args.model}/MiniCPM/tokenizer_config.json", "r") as f:
     d = json.load(f)
     d.pop("auto_map")
     d["tokenizer_class"] = "LlamaTokenizer"
-    d.pop("add_prefix_space")
+    if "add_prefix_space" in d:
+        d.pop("add_prefix_space")
 
 with open(f"{args.model}/MiniCPM/tokenizer_config.json", "w") as f:
     json.dump(d, f, indent=2)
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index cfd48d16d..6562c6660 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -44,7 +44,6 @@ class Keys:
         EXPERT_USED_COUNT = "{arch}.expert_used_count"
         POOLING_TYPE      = "{arch}.pooling_type"
         LOGIT_SCALE       = "{arch}.logit_scale"
-        TIE_LM_HEAD       = "{arch}.tie_lm_head"
 
     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"
@@ -901,7 +900,6 @@ KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT
 KEY_FEED_FORWARD_LENGTH = Keys.LLM.FEED_FORWARD_LENGTH
 KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL
 KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT
-KEY_TIE_LM_HEAD = Keys.LLM.TIE_LM_HEAD
 
 # attention
 KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index b75dd472d..e3dbca454 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -350,9 +350,6 @@ class GGUFWriter:
     def add_parallel_residual(self, use: bool) -> None:
         self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
 
-    def add_tie_lm_head(self, tie_lm_head: bool) -> None:
-        self.add_bool(Keys.LLM.TIE_LM_HEAD.format(arch=self.arch), tie_lm_head)
-
     def add_head_count(self, count: int) -> None:
         self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count)
 
diff --git a/llama.cpp b/llama.cpp
index 7894aac87..7f8d3ca57 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -285,7 +285,6 @@ enum llm_kv {
     LLM_KV_EXPERT_USED_COUNT,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
-    LLM_KV_TIE_LM_HEAD,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -362,7 +361,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
     { LLM_KV_POOLING_TYPE ,     "%s.pooling_type"      },
     { LLM_KV_LOGIT_SCALE,       "%s.logit_scale"       },
-    { LLM_KV_TIE_LM_HEAD,       "%s.tie_lm_head"       },
 
     { LLM_KV_ATTENTION_HEAD_COUNT,    "%s.attention.head_count"    },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -1827,7 +1825,6 @@ struct llama_hparams {
 
     bool causal_attn = true;
     bool need_kq_pos = false;
-    bool tie_lm_head = true;
 
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -3314,6 +3311,7 @@ struct llama_model_loader {
         ggml_set_name(tensor, ggml_get_name(cur));
 
         n_created++;
+        printf("%s: created tensor '%s'\n", __func__, ggml_get_name(tensor));
 
         return tensor;
     }
@@ -3382,6 +3380,8 @@ struct llama_model_loader {
         ggml_set_name(tensor, name.c_str());
 
         n_created++;
+        printf("%s: created tensor '%s'\n", __func__, name.c_str());
+
         return tensor;
     }
 
@@ -3699,7 +3699,6 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
     ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
     ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
-    ml.get_key(LLM_KV_TIE_LM_HEAD, hparams.tie_lm_head, false);
 
     GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
     GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
@@ -4711,8 +4710,12 @@ static bool llm_load_tensors(
            case LLM_ARCH_MINICPM:
                {
                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                   if (!hparams.tie_lm_head){
-                       model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                   model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                   // if output is NULL, init from the input tok embed
+                   if (model.output == NULL) {
+                       model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                       ml.n_created--; // artificial tensor
+                       ml.size_data += ggml_nbytes(model.output);
                    }
 
                    // output
@@ -4793,6 +4796,7 @@ static bool llm_load_tensors(
                    if (model.output == NULL) {
                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                        ml.n_created--; // artificial tensor
+                       printf("created tensor decrese GROK\n");
                        ml.size_data += ggml_nbytes(model.output);
                    }
                }
@@ -4922,6 +4926,7 @@ static bool llm_load_tensors(
                    if (!model.output) {
                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                        ml.n_created--; // artificial tensor
+                       printf("created tensor decrese FALCON\n");
                        ml.size_data += ggml_nbytes(model.output);
                    }
                }
@@ -5127,6 +5132,7 @@ static bool llm_load_tensors(
                    if (!model.output) {
                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                        ml.n_created--; // artificial tensor
+                       printf("created tensor decrese MPT\n");
                        ml.size_data += ggml_nbytes(model.output);
                    }
                }
@@ -5249,6 +5255,7 @@ static bool llm_load_tensors(
                    if (model.output == NULL) {
                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                        ml.n_created--; // artificial tensor
+                       printf("created tensor decrese QWEN2\n");
                        ml.size_data += ggml_nbytes(model.output);
                    }
                }
@@ -5539,6 +5546,7 @@ static bool llm_load_tensors(
                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                    model.output      = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
                    ml.n_created--; // artificial tensor
+                   printf("created tensor decrese GEMMA\n");
                    ml.size_data += ggml_nbytes(model.output);
 
                    const int64_t n_ff = hparams.n_ff;
@@ -5579,6 +5587,7 @@ static bool llm_load_tensors(
                    if (model.output == NULL) {
                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                        ml.n_created--; // artificial tensor
+                       printf("created tensor decrese STARCODER2\n");
                        ml.size_data += ggml_nbytes(model.output);
                    }
 
@@ -5635,6 +5644,7 @@ static bool llm_load_tensors(
                    if (model.output == NULL) {
                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                        ml.n_created--; // artificial tensor
+                       printf("created tensor decrese MAMBA\n");
                        ml.size_data += ggml_nbytes(model.output);
                    }
                }
@@ -5698,6 +5708,7 @@ static bool llm_load_tensors(
                    // init output from the input tok embed
                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                    ml.n_created--; // artificial tensor
+                   printf("created tensor decrese COMMAND_R\n");
                    ml.size_data += ggml_nbytes(model.output);
                }
 
@@ -5735,6 +5746,7 @@ static bool llm_load_tensors(
                    if (model.output == NULL) {
                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                        ml.n_created--; // artificial tensor
+                       printf("created tensor decrese OLMO\n");
                        ml.size_data += ggml_nbytes(model.output);
                    }
                }
@@ -9656,11 +9668,7 @@ struct llm_build_context {
        cb(cur, "lmhead_scaling", -1);
 
        // lm_head
-       if (hparams.tie_lm_head){
-           cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
-       } else {
-           cur = ggml_mul_mat(ctx0, model.output, cur);
-       }
+       cur = ggml_mul_mat(ctx0, model.output, cur);
        cb(cur, "result_output", -1);
 
        ggml_build_forward_expand(gf, cur);