llama : use new pre-tokenizer type

Georgi Gerganov 2024-04-26 20:08:28 +03:00
parent 9b4d63ae53
commit 43e12ce8e5
12 changed files with 87 additions and 44 deletions

convert-hf-to-gguf.py

@@ -398,6 +398,9 @@ class Model(ABC):
         if chkhsh == -3290901550109860290:
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/main/tokenizer.json
             res = "llama3"
+        if chkhsh == 5332289095291046364:
+            # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-chat/blob/main/tokenizer.json
+            res = "deepseek-llm"
         if chkhsh == 4190561703949727616:
             # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct/blob/main/tokenizer.json
             res = "deepseek-coder"

llama.cpp

@@ -316,6 +316,7 @@ enum llm_kv {
     LLM_KV_SSM_TIME_STEP_RANK,

     LLM_KV_TOKENIZER_MODEL,
+    LLM_KV_TOKENIZER_PRE,
     LLM_KV_TOKENIZER_LIST,
     LLM_KV_TOKENIZER_TOKEN_TYPE,
     LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
@@ -392,6 +393,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_SSM_TIME_STEP_RANK,      "%s.ssm.time_step_rank" },

     { LLM_KV_TOKENIZER_MODEL,             "tokenizer.ggml.model" },
+    { LLM_KV_TOKENIZER_PRE,               "tokenizer.ggml.pre" },
     { LLM_KV_TOKENIZER_LIST,              "tokenizer.ggml.tokens" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE,        "tokenizer.ggml.token_type" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,  "tokenizer.ggml.token_type_count" },
@@ -2114,8 +2116,8 @@ struct llama_vocab {
         ttype type;
     };

-    enum llm_arch arch = LLM_ARCH_UNKNOWN;
-    enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
+    enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
+    enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;

     std::unordered_map<token, id> token_to_id;
     std::vector<token_data>       id_to_token;
@@ -4166,11 +4168,13 @@ static void llm_load_vocab(
     // determine vocab type
     {
-        std::string tokenizer_name;
+        std::string tokenizer_model;
+        std::string tokenizer_pre;

-        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
+        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
+        ml.get_key(LLM_KV_TOKENIZER_PRE,   tokenizer_pre, false);

-        if (tokenizer_name == "no_vocab") {
+        if (tokenizer_model == "no_vocab") {
             vocab.type = LLAMA_VOCAB_TYPE_NONE;

             // default special tokens
@@ -4184,7 +4188,7 @@ static void llm_load_vocab(
             vocab.linefeed_id = -1;

             return;
-        } else if (tokenizer_name == "llama") {
+        } else if (tokenizer_model == "llama") {
             vocab.type = LLAMA_VOCAB_TYPE_SPM;

             // default special tokens
@@ -4229,7 +4233,7 @@ static void llm_load_vocab(
             if (add_space_prefix_keyidx != -1) {
                 vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
             } // The default value of add_space_prefix is true.
-        } else if (tokenizer_name == "bert") {
+        } else if (tokenizer_model == "bert") {
             vocab.type = LLAMA_VOCAB_TYPE_WPM;

             // default special tokens
@@ -4242,10 +4246,10 @@ static void llm_load_vocab(
             vocab.special_mask_id = 103;
             vocab.add_space_prefix = false;
         } else {
-            if (tokenizer_name == "gpt2") {
+            if (tokenizer_model == "gpt2") {
                 vocab.type = LLAMA_VOCAB_TYPE_BPE;
             } else {
-                LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
+                LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
                 LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
                 vocab.type = LLAMA_VOCAB_TYPE_SPM;
                 return;
@@ -4285,7 +4289,20 @@ static void llm_load_vocab(
             vocab.special_mask_id = -1;
         }

-        vocab.arch = model.arch;
+        if (tokenizer_pre.empty()) {
+            LLAMA_LOG_WARN("%s: missing tokenizer pre, using default tokenizer pre: 'default'", __func__);
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+        } else if (tokenizer_pre == "default") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+        } else if (tokenizer_pre == "llama3") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+        } else if (tokenizer_pre == "deepseek-llm") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
+        } else if (tokenizer_pre == "deepseek-coder") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
+        } else {
+            throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+        }
     }

     const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
@@ -12011,38 +12028,44 @@ struct llm_tokenizer_bpe {
         std::vector<std::string> word_collection;
         switch (vocab.type) {
             case LLAMA_VOCAB_TYPE_BPE:
-                switch (vocab.arch) {
-                    // TODO: how to detect deepseek and llama v3 models?
-                    //case LLM_ARCH_LLAMA:
-                    //case LLM_ARCH_DEEPSEEK_CODER:
-                    //    word_collection = unicode_regex_split(text, {
-                    //        "[\r\n]",
-                    //        "\\s?\\p{L}+",
-                    //        "\\s?\\p{P}+",
-                    //        "[一-龥ࠀ-一가-퟿]+",
-                    //        "\\p{N}+"
-                    //    });
-                    //    break;
-                    //case LLM_ARCH_DEEPSEEK_LLM:
-                    //    word_collection = unicode_regex_split(text, {
-                    //        "[\r\n]",
-                    //        "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿﬀ-ﬆﬓ-ﬗＡ-Ｚａ-ｚ𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
-                    //        "\\s?[!-/:-~！-／：-～‘-‟　-。]+",
-                    //        "\\s+$",
-                    //        "[一-龥ࠀ-一가-퟿]+",
-                    //        "\\p{N}+"
-                    //    });
-                    //    break;
+                switch (vocab.type_pre) {
+                    case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+                        word_collection = unicode_regex_split(text, {
+                            // TODO: ??????????????
+                            //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                            "\\p{P}+",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                            "\\p{N}+",
+                            "[0-9][0-9][0-9]"
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
+                        word_collection = unicode_regex_split(text, {
+                            "[\r\n]",
+                            "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿﬀ-ﬆﬓ-ﬗＡ-Ｚａ-ｚ𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+                            "\\s?[!-/:-~！-／：-～‘-‟　-。]+",
+                            "\\s+$",
+                            "[一-龥ࠀ-一가-퟿]+",
+                            "\\p{N}+"
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
+                        word_collection = unicode_regex_split(text, {
+                            "[\r\n]",
+                            "\\s?\\p{L}+",
+                            "\\s?\\p{P}+",
+                            "[一-龥ࠀ-一가-퟿]+",
+                            "\\p{N}+"
+                        });
+                        break;
                     default:
                         // default regex for BPE tokenization pre-processing
-                        {
-                            word_collection = unicode_regex_split(text, {
-                                "\\p{P}+",
-                                "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-                                "\\p{N}+",
-                                "[0-9][0-9][0-9]"
-                            });
-                        }
+                        word_collection = unicode_regex_split(text, {
+                            "\\p{P}+",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                            "\\p{N}+",
+                            "[0-9][0-9][0-9]"
+                        });
+                        break;
                 }
                 break;
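Note: unicode_regex_split() is llama.cpp's internal helper from unicode.h; it applies the expressions in order, so each regex further partitions the fragments produced by the one before it, and BPE merging then runs per fragment. A minimal usage sketch, assuming the declaration std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs):

    // build inside the llama.cpp tree so that unicode.h is on the include path
    #include <cstdio>
    #include <string>
    #include <vector>

    #include "unicode.h"

    int main() {
        const std::string text = "Hello world, 1234!";

        // the default BPE pre-tokenization expressions from the hunk above
        const std::vector<std::string> exprs = {
            "\\p{P}+",
            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
            "\\p{N}+",
            "[0-9][0-9][0-9]",
        };

        // each printed fragment is a unit that BPE merging operates on
        for (const std::string & w : unicode_regex_split(text, exprs)) {
            printf("'%s'\n", w.c_str());
        }
        return 0;
    }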

llama.h

@@ -69,6 +69,14 @@ extern "C" {
         LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
     };

+    // pre-tokenization types
+    enum llama_vocab_pre_type {
+        LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
+        LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
+    };
+
     // note: these values should be synchronized with ggml_rope
     // TODO: maybe move this enum to ggml.h (ggml_rope_type)
     enum llama_rope_type {
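Note: because the pre-tokenizer name is stored as a plain string under tokenizer.ggml.pre, it can be inspected in any converted model with the public gguf API from ggml.h. A small sketch (takes a GGUF file path as the only argument, error handling kept minimal):

    #include <cstdio>
    #include "ggml.h"

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
            return 1;
        }
        struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
        struct gguf_context * ctx = gguf_init_from_file(argv[1], params);
        if (!ctx) {
            fprintf(stderr, "failed to load %s\n", argv[1]);
            return 1;
        }
        const int kid = gguf_find_key(ctx, "tokenizer.ggml.pre");
        if (kid >= 0) {
            printf("tokenizer.ggml.pre = %s\n", gguf_get_val_str(ctx, kid));
        } else {
            printf("tokenizer.ggml.pre missing; the loader falls back to 'default'\n");
        }
        gguf_free(ctx);
        return 0;
    }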

Binary file not shown.

Binary file not shown.

tests/CMakeLists.txt

@@ -41,13 +41,12 @@ llama_test(test-quantize-perf.cpp)
 llama_test(test-sampling.cpp)
 llama_test(test-chat-template.cpp)

-# TODO: tmp disabled LLaMA v3 and Deepseek tests
 llama_test(test-tokenizer-0-llama.cpp NAME test-tokenizer-0-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-#llama_test(test-tokenizer-0-llama-v3.cpp NAME test-tokenizer-0-llama-v3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-v3.gguf)
+llama_test(test-tokenizer-0-llama-v3.cpp NAME test-tokenizer-0-llama-v3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-v3.gguf)
 llama_test(test-tokenizer-0-falcon.cpp NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-#llama_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
-#llama_test(test-tokenizer-0-deepseek-llm.cpp NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
+llama_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
+llama_test(test-tokenizer-0-deepseek-llm.cpp NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
 llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
 llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)

tests/test-tokenizer-0-*.py

@@ -27,6 +27,8 @@ tests = [
     " ",
     "\t",
     "\n",
+    "\n\n",
+    "\n\n\n",
     "\t\n",
     "Hello world",
     " Hello world",

tests/test-tokenizer-0-falcon.cpp

@@ -17,6 +17,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
         { " "            , {    466, }, },
         { "\t"           , {    192, }, },
         { "\n"           , {    193, }, },
+        { "\n\n"         , {   1001, }, },
+        { "\n\n\n"       , {  11331, }, },
         { "\t\n"         , {  19125, }, },
         { "Hello world"  , {   9856,  1079, }, },
         { " Hello world" , {  23090,  1079, }, },

tests/test-tokenizer-0-llama-v3.cpp

@@ -17,6 +17,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
         { " "            , {    262, }, },
         { "\t"           , {    197, }, },
         { "\n"           , {    198, }, },
+        { "\n\n"         , {    271, }, },
+        { "\n\n\n"       , {   1432, }, },
         { "\t\n"         , {   1602, }, },
         { "Hello world"  , {   9906,  1917, }, },
         { " Hello world" , {  22691,  1917, }, },

tests/test-tokenizer-0-llama.cpp

@@ -17,6 +17,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
         { " "            , {            268, }, },
         { "\t"           , {  29871,     12, }, },
         { "\n"           , {  29871,     13, }, },
+        { "\n\n"         , {  29871,     13,     13, }, },
+        { "\n\n\n"       , {  29871,     13,     13,     13, }, },
         { "\t\n"         , {  29871,     12,     13, }, },
         { "Hello world"  , {  15043,   3186, }, },
         { " Hello world" , {  29871,  15043,   3186, }, },

tests/test-tokenizer-0-*.py

@@ -27,6 +27,8 @@ tests = [
     " ",
     "\t",
     "\n",
+    "\n\n",
+    "\n\n\n",
     "\t\n",
     "Hello world",
     " Hello world",