diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 1aab4d2fe..28b060ed3 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -398,6 +398,9 @@ class Model(ABC):
         if chkhsh == -3290901550109860290:
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/main/tokenizer.json
             res = "llama3"
+        if chkhsh == 5332289095291046364:
+            # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-chat/blob/main/tokenizer.json
+            res = "deepseek-llm"
         if chkhsh == 4190561703949727616:
             # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct/blob/main/tokenizer.json
             res = "deepseek-coder"
diff --git a/llama.cpp b/llama.cpp
index 09d8a0dd8..e05d10cdb 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -316,6 +316,7 @@ enum llm_kv {
     LLM_KV_SSM_TIME_STEP_RANK,
 
     LLM_KV_TOKENIZER_MODEL,
+    LLM_KV_TOKENIZER_PRE,
     LLM_KV_TOKENIZER_LIST,
     LLM_KV_TOKENIZER_TOKEN_TYPE,
     LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
@@ -392,6 +393,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_SSM_TIME_STEP_RANK,            "%s.ssm.time_step_rank" },
 
     { LLM_KV_TOKENIZER_MODEL,               "tokenizer.ggml.model" },
+    { LLM_KV_TOKENIZER_PRE,                 "tokenizer.ggml.pre" },
     { LLM_KV_TOKENIZER_LIST,                "tokenizer.ggml.tokens" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE,          "tokenizer.ggml.token_type" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,    "tokenizer.ggml.token_type_count" },
@@ -2114,8 +2116,8 @@ struct llama_vocab {
         ttype type;
     };
 
-    enum llm_arch arch = LLM_ARCH_UNKNOWN;
-    enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
+    enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
+    enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 
     std::unordered_map<token, id> token_to_id;
     std::vector<token_data>       id_to_token;
@@ -4166,11 +4168,13 @@ static void llm_load_vocab(
 
     // determine vocab type
     {
-        std::string tokenizer_name;
+        std::string tokenizer_model;
+        std::string tokenizer_pre;
 
-        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
+        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
+        ml.get_key(LLM_KV_TOKENIZER_PRE,   tokenizer_pre, false);
 
-        if (tokenizer_name == "no_vocab") {
+        if (tokenizer_model == "no_vocab") {
             vocab.type = LLAMA_VOCAB_TYPE_NONE;
 
             // default special tokens
@@ -4184,7 +4188,7 @@ static void llm_load_vocab(
 
             vocab.linefeed_id = -1;
             return;
-        } else if (tokenizer_name == "llama") {
+        } else if (tokenizer_model == "llama") {
            vocab.type = LLAMA_VOCAB_TYPE_SPM;
 
             // default special tokens
@@ -4229,7 +4233,7 @@ static void llm_load_vocab(
             if (add_space_prefix_keyidx != -1) {
                 vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
             } // The default value of add_space_prefix is true.
-        } else if (tokenizer_name == "bert") {
+        } else if (tokenizer_model == "bert") {
             vocab.type = LLAMA_VOCAB_TYPE_WPM;
 
             // default special tokens
@@ -4242,10 +4246,10 @@ static void llm_load_vocab(
             vocab.special_mask_id = 103;
             vocab.add_space_prefix = false;
         } else {
-            if (tokenizer_name == "gpt2") {
+            if (tokenizer_model == "gpt2") {
                 vocab.type = LLAMA_VOCAB_TYPE_BPE;
             } else {
-                LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
+                LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
                 LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
                 vocab.type = LLAMA_VOCAB_TYPE_SPM;
                 return;
@@ -4285,7 +4289,20 @@ static void llm_load_vocab(
             vocab.special_mask_id = -1;
         }
 
-        vocab.arch = model.arch;
+        if (tokenizer_pre.empty()) {
+            LLAMA_LOG_WARN("%s: missing tokenizer pre, using default tokenizer pre: 'default'", __func__);
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+        } else if (tokenizer_pre == "default") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+        } else if (tokenizer_pre == "llama3") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+        } else if (tokenizer_pre == "deepseek-llm") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
+        } else if (tokenizer_pre == "deepseek-coder") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
+        } else {
+            throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+        }
     }
 
     const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
@@ -12011,38 +12028,44 @@ struct llm_tokenizer_bpe {
         std::vector<std::string> word_collection;
         switch (vocab.type) {
             case LLAMA_VOCAB_TYPE_BPE:
-                switch (vocab.arch) {
-                    // TODO: how to detect deepseek and llama v3 models?
-                    //case LLM_ARCH_LLAMA:
-                    //case LLM_ARCH_DEEPSEEK_CODER:
-                    //    word_collection = unicode_regex_split(text, {
-                    //        "[\r\n]",
-                    //        "\\s?\\p{L}+",
-                    //        "\\s?\\p{P}+",
-                    //        "[一-龥ࠀ-一가-퟿]+",
-                    //        "\\p{N}+"
-                    //    });
-                    //    break;
-                    //case LLM_ARCH_DEEPSEEK_LLM:
-                    //    word_collection = unicode_regex_split(text, {
-                    //        "[\r\n]",
-                    //        "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
-                    //        "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
-                    //        "\\s+$",
-                    //        "[一-龥ࠀ-一가-퟿]+",
-                    //        "\\p{N}+"
-                    //    });
-                    //    break;
+                switch (vocab.type_pre) {
+                    case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+                        word_collection = unicode_regex_split(text, {
+                            // TODO: ??????????????
+ //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+ + "\\p{P}+", + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + "\\p{N}+", + "[0-9][0-9][0-9]" + }); + break; + case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: + word_collection = unicode_regex_split(text, { + "[\r\n]", + "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+", + "\\s?[!-/:-~!-/:-~‘-‟ -。]+", + "\\s+$", + "[一-龥ࠀ-一가-퟿]+", + "\\p{N}+" + }); + break; + case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: + word_collection = unicode_regex_split(text, { + "[\r\n]", + "\\s?\\p{L}+", + "\\s?\\p{P}+", + "[一-龥ࠀ-一가-퟿]+", + "\\p{N}+" + }); + break; default: // default regex for BPE tokenization pre-processing - { - word_collection = unicode_regex_split(text, { - "\\p{P}+", - "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", - "\\p{N}+", - "[0-9][0-9][0-9]" - }); - } + word_collection = unicode_regex_split(text, { + "\\p{P}+", + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + "\\p{N}+", + "[0-9][0-9][0-9]" + }); break; } break; diff --git a/llama.h b/llama.h index 8aa763672..9c89d72af 100644 --- a/llama.h +++ b/llama.h @@ -69,6 +69,14 @@ extern "C" { LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece }; + // pre-tokenization types + enum llama_vocab_pre_type { + LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0, + LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1, + LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2, + LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3, + }; + // note: these values should be synchronized with ggml_rope // TODO: maybe move this enum to ggml.h (ggml_rope_type) enum llama_rope_type { diff --git a/models/ggml-vocab-deepseek-coder.gguf b/models/ggml-vocab-deepseek-coder.gguf index 640ee63d8..8ea17fa4d 100644 Binary files a/models/ggml-vocab-deepseek-coder.gguf and b/models/ggml-vocab-deepseek-coder.gguf differ diff --git a/models/ggml-vocab-deepseek-llm.gguf b/models/ggml-vocab-deepseek-llm.gguf index 8fed82fa0..1e087220f 100644 Binary files a/models/ggml-vocab-deepseek-llm.gguf and b/models/ggml-vocab-deepseek-llm.gguf differ diff --git a/models/ggml-vocab-llama.gguf b/models/ggml-vocab-llama.gguf index 549eed8c5..568ffdc16 100644 Binary files a/models/ggml-vocab-llama.gguf and b/models/ggml-vocab-llama.gguf differ diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4f0889007..3acf28ba4 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -41,13 +41,12 @@ llama_test(test-quantize-perf.cpp) llama_test(test-sampling.cpp) llama_test(test-chat-template.cpp) -# TODO: tmp disabled LLaMA v3 and Deepseek tests llama_test(test-tokenizer-0-llama.cpp NAME test-tokenizer-0-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf) -#llama_test(test-tokenizer-0-llama-v3.cpp NAME test-tokenizer-0-llama-v3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-v3.gguf) +llama_test(test-tokenizer-0-llama-v3.cpp NAME test-tokenizer-0-llama-v3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-v3.gguf) llama_test(test-tokenizer-0-falcon.cpp NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) -#llama_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder ARGS 
${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf) -#llama_test(test-tokenizer-0-deepseek-llm.cpp NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf) +llama_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf) +llama_test(test-tokenizer-0-deepseek-llm.cpp NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf) llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf) llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf) diff --git a/tests/test-tokenizer-0-bpe.py b/tests/test-tokenizer-0-bpe.py index d412ce039..38aa33c46 100644 --- a/tests/test-tokenizer-0-bpe.py +++ b/tests/test-tokenizer-0-bpe.py @@ -27,6 +27,8 @@ tests = [ " ", "\t", "\n", + "\n\n", + "\n\n\n", "\t\n", "Hello world", " Hello world", diff --git a/tests/test-tokenizer-0-falcon.cpp b/tests/test-tokenizer-0-falcon.cpp index 3e8877563..67f675a62 100644 --- a/tests/test-tokenizer-0-falcon.cpp +++ b/tests/test-tokenizer-0-falcon.cpp @@ -17,6 +17,8 @@ static const std::map> & k_tests() { { " " , { 466, }, }, { "\t" , { 192, }, }, { "\n" , { 193, }, }, + { "\n\n" , { 1001, }, }, + { "\n\n\n" , { 11331, }, }, { "\t\n" , { 19125, }, }, { "Hello world" , { 9856, 1079, }, }, { " Hello world" , { 23090, 1079, }, }, diff --git a/tests/test-tokenizer-0-llama-v3.cpp b/tests/test-tokenizer-0-llama-v3.cpp index a0ecf6283..2e91b717f 100644 --- a/tests/test-tokenizer-0-llama-v3.cpp +++ b/tests/test-tokenizer-0-llama-v3.cpp @@ -17,6 +17,8 @@ static const std::map> & k_tests() { { " " , { 262, }, }, { "\t" , { 197, }, }, { "\n" , { 198, }, }, + { "\n\n" , { 271, }, }, + { "\n\n\n" , { 1432, }, }, { "\t\n" , { 1602, }, }, { "Hello world" , { 9906, 1917, }, }, { " Hello world" , { 22691, 1917, }, }, diff --git a/tests/test-tokenizer-0-llama.cpp b/tests/test-tokenizer-0-llama.cpp index fd407041b..f0634cfe5 100644 --- a/tests/test-tokenizer-0-llama.cpp +++ b/tests/test-tokenizer-0-llama.cpp @@ -17,6 +17,8 @@ static const std::map> & k_tests() { { " " , { 268, }, }, { "\t" , { 29871, 12, }, }, { "\n" , { 29871, 13, }, }, + { "\n\n" , { 29871, 13, 13, }, }, + { "\n\n\n" , { 29871, 13, 13, 13, }, }, { "\t\n" , { 29871, 12, 13, }, }, { "Hello world" , { 15043, 3186, }, }, { " Hello world" , { 29871, 15043, 3186, }, }, diff --git a/tests/test-tokenizer-0-spm.py b/tests/test-tokenizer-0-spm.py index f2d3b6e88..be12a6b93 100644 --- a/tests/test-tokenizer-0-spm.py +++ b/tests/test-tokenizer-0-spm.py @@ -27,6 +27,8 @@ tests = [ " ", "\t", "\n", + "\n\n", + "\n\n\n", "\t\n", "Hello world", " Hello world",
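Reviewer note (illustrative, not part of the patch): the chkhsh constants in convert-hf-to-gguf.py fingerprint a tokenizer by its behavior rather than its metadata. The script tokenizes a fixed probe string and compares a checksum of the result against known values; the matching res string ("llama3", "deepseek-llm", "deepseek-coder") lines up with the tokenizer_pre names accepted in llm_load_vocab, so it is evidently what ends up in the new "tokenizer.ggml.pre" KV. A minimal sketch of the idea in Python; the probe text, checksum scheme, and function name below are assumptions for illustration, not the script's actual code:

    # Hypothetical sketch; the real probe string and checksum scheme live in
    # convert-hf-to-gguf.py, and the hard-coded constants must match its output.
    from transformers import AutoTokenizer

    def tokenizer_fingerprint(model_dir: str) -> int:
        tok = AutoTokenizer.from_pretrained(model_dir)
        probe = "Hello world \t 123 3333 \n\n\n"  # fixed probe text (illustrative)
        # hash() of a tuple of ints is deterministic across runs, so two
        # tokenizers that split the probe identically give the same fingerprint
        return hash(tuple(tok.encode(probe)))

Any change to the pre-tokenizer regexes changes how the probe is split, hence the token ids, hence the checksum, which is what lets the hard-coded comparisons discriminate between otherwise similar BPE tokenizers.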
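Second note: the new llama_vocab_pre_type selects the regex set that splits raw text into chunks before the BPE merge loop runs, and the "\n\n" / "\n\n\n" test cases added here pin down exactly the kind of input such splits get wrong. The commented-out pattern in the LLAMA3 branch is the one from the model's tokenizer.json; a quick way to see what it does is Python's third-party regex module (unlike re, it supports \p{L}-style Unicode classes everywhere):

    # pip install regex
    import regex

    # the Llama 3 pattern quoted (still commented out) in the diff above
    PAT = (r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}"
           r"| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+")

    print(regex.findall(PAT, "Hello world\n\n123 3333"))
    # ['Hello', ' world', '\n\n', '123', ' ', '333', '3']

Note how "\n\n" stays a single chunk and digit runs are capped at three per chunk; the default BPE regex set can treat these inputs differently, which is why the pre-tokenizer now has to be model-specific.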
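Third note: unicode_regex_split takes a list of patterns, so a set like the DEEPSEEK_CODER one is applied as successive splitting passes rather than as one big alternation. A rough Python model of that idea (a sketch only, not a port; the C++ implementation's exact handling of already-matched pieces may differ):

    import regex

    def regex_split_seq(text: str, patterns: list[str]) -> list[str]:
        # each pattern splits every current piece into its matches plus the
        # gaps between them; later patterns refine the pieces further
        pieces = [text]
        for pat in patterns:
            nxt = []
            for piece in pieces:
                last = 0
                for m in regex.finditer(pat, piece):
                    if m.start() > last:
                        nxt.append(piece[last:m.start()])
                    nxt.append(m.group())
                    last = m.end()
                if last < len(piece):
                    nxt.append(piece[last:])
            pieces = nxt
        return pieces

    # a subset of the DEEPSEEK_CODER patterns from the diff above
    print(regex_split_seq("Hello world\n123",
                          [r"[\r\n]", r"\s?\p{L}+", r"\s?\p{P}+", r"\p{N}+"]))
    # ['Hello', ' world', '\n', '123']

Ordering matters in such a scheme: putting "[\r\n]" first guarantees newlines become their own chunks before the looser letter and punctuation patterns get a chance to absorb them.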