diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index c5d2d0b78..78a2b1c67 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1559,6 +1559,7 @@ class InternLM2Model(Model):
         self.gguf_writer.add_add_space_prefix(add_prefix)
 
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        print(special_vocab)
         old_eos = special_vocab.special_token_ids["eos"]
         if "chat" in os.path.basename(self.dir_model.absolute()):
             # For the chat model, we replace the eos with '<|im_end|>'.
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index e2d07a631..2a19817d9 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -795,7 +795,9 @@ int main(int argc, char ** argv) {
            }
 
            // deal with end of text token in interactive mode
-           if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
+           auto last_token = llama_sampling_last(ctx_sampling);
+           if (last_token == llama_token_eos(model) || last_token == 122753)
+           {
                LOG("found EOS token\n");
 
                if (params.interactive) {
@@ -920,7 +922,7 @@ int main(int argc, char ** argv) {
        }
 
        // end of text token
-       if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive || params.chatml)) {
+       if (!embd.empty() && (embd.back() == llama_token_eos(model) || embd.back() == 122753) && !(params.instruct || params.interactive || params.chatml)) {
            LOG_TEE(" [end of text]\n");
            break;
        }
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 4ab026482..ee26224d4 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -548,6 +548,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
     ],
     MODEL_ARCH.MINICPM: [
         MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT,
         MODEL_TENSOR.OUTPUT_NORM,
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_NORM,
diff --git a/llama.cpp b/llama.cpp
index 892d46fbc..8363233fa 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4375,6 +4375,7 @@ static bool llm_load_tensors(
            case LLM_ARCH_MINICPM:
                {
                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                   model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
 
                    // output
                    {
@@ -8699,7 +8700,7 @@ struct llm_build_context {
        cb(cur, "lmhead_scaling", -1);
 
        // lm_head
-       cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
+       cur = ggml_mul_mat(ctx0, model.output, cur);
        cb(cur, "result_output", -1);
 
        ggml_build_forward_expand(gf, cur);
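Note on the two examples/main/main.cpp hunks: both compare the sampled token against the bare literal 122753, presumably a model-specific end-of-turn token id (the patch itself does not say which token it corresponds to). A minimal sketch of how that duplicated check could be factored into a single helper; the names extra_stop_tokens and is_stop_token are hypothetical and not part of llama.cpp:

// Sketch only: one place for the EOS comparison plus any model-specific
// extra stop ids. 122753 is copied verbatim from the hunks above; its
// meaning (assumed here: a chat end-of-turn token) is not stated in the patch.
#include <cstdint>
#include <vector>

typedef std::int32_t llama_token; // same underlying type as llama.h's typedef

static const std::vector<llama_token> extra_stop_tokens = { 122753 };

static bool is_stop_token(llama_token tok, llama_token eos) {
    if (tok == eos) {
        return true;
    }
    for (const llama_token t : extra_stop_tokens) {
        if (tok == t) {
            return true;
        }
    }
    return false;
}

With such a helper, the first hunk would reduce to is_stop_token(last_token, llama_token_eos(model)) and the second to is_stop_token(embd.back(), llama_token_eos(model)), keeping the magic number in exactly one place.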