Fix tokenizer_clean_spaces for megrez

2024-12-22 14:50:59 +08:00 · 2024-12-22 14:50:59 +08:00 · 01a0c36e04
commit 01a0c36e04
parent a02c63d710
1 changed file with 4 additions and 2 deletions
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6604,8 +6604,7 @@ static void llm_load_vocab(
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
                vocab.tokenizer_clean_spaces = false;
            } else if (
-                tokenizer_pre == "qwen2" ||
-                tokenizer_pre == "megrez") {
+                tokenizer_pre == "qwen2") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
                vocab.tokenizer_clean_spaces = false;
            } else if (
@@ -6665,6 +6664,9 @@ static void llm_load_vocab(
            } else if (
                tokenizer_pre == "minerva-7b") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
+            } else if (
+                tokenizer_pre == "megrez") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
            } else {
                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
            }