From 43e12ce8e582cd81857bfdeaeaef7bbcd1a456f0 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 26 Apr 2024 20:08:28 +0300
Subject: [PATCH] llama : use new pre-tokenizer type

---
 convert-hf-to-gguf.py                 |   3 +
 llama.cpp                             | 103 ++++++++++++++++----
 llama.h                               |   8 ++
 models/ggml-vocab-deepseek-coder.gguf | Bin 1157268 -> 1157113 bytes
 models/ggml-vocab-deepseek-llm.gguf   | Bin 3970116 -> 3970627 bytes
 models/ggml-vocab-llama.gguf          | Bin 723676 -> 724549 bytes
 tests/CMakeLists.txt                  |   7 +-
 tests/test-tokenizer-0-bpe.py         |   2 +
 tests/test-tokenizer-0-falcon.cpp     |   2 +
 tests/test-tokenizer-0-llama-v3.cpp   |   2 +
 tests/test-tokenizer-0-llama.cpp      |   2 +
 tests/test-tokenizer-0-spm.py         |   2 +
 12 files changed, 87 insertions(+), 44 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 1aab4d2fe..28b060ed3 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -398,6 +398,9 @@ class Model(ABC):
         if chkhsh == -3290901550109860290:
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/main/tokenizer.json
             res = "llama3"
+        if chkhsh == 5332289095291046364:
+            # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-chat/blob/main/tokenizer.json
+            res = "deepseek-llm"
         if chkhsh == 4190561703949727616:
             # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct/blob/main/tokenizer.json
             res = "deepseek-coder"
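The hunk above extends the chkhsh-based detection table in the convert script: each known tokenizer checksum maps to a pre-tokenizer name, which the conversion then records so that llama.cpp can pick the matching splitting rules. The standalone C++ sketch below illustrates one way such a fingerprint can be computed, by hashing the token ids a tokenizer produces for a probe string. The probe text, the hash constants, and the tokenize callback are hypothetical placeholders, not the values convert-hf-to-gguf.py actually uses.

    // Illustrative only: fingerprint a tokenizer by hashing how it encodes a
    // probe string. The probe text, hash constants and `tokenize` callback are
    // made-up placeholders; convert-hf-to-gguf.py computes its own chkhsh.
    #include <cstdint>
    #include <functional>
    #include <stdexcept>
    #include <string>
    #include <vector>

    using tokenize_fn = std::function<std::vector<int32_t>(const std::string &)>;

    static uint64_t tokenizer_fingerprint(const tokenize_fn & tokenize) {
        const std::string probe = " \t\n Hello world 3.14"; // hypothetical probe text
        uint64_t h = 1469598103934665603ull;                // FNV-1a offset basis
        for (const int32_t id : tokenize(probe)) {
            h = (h ^ (uint64_t) id) * 1099511628211ull;     // FNV-1a prime
        }
        return h;
    }

    static std::string detect_pre_type(const tokenize_fn & tokenize) {
        const uint64_t h = tokenizer_fingerprint(tokenize);
        if (h == 0x1111111111111111ull) { return "llama3";         } // placeholder hash
        if (h == 0x2222222222222222ull) { return "deepseek-llm";   } // placeholder hash
        if (h == 0x3333333333333333ull) { return "deepseek-coder"; } // placeholder hash
        throw std::runtime_error("unknown pre-tokenizer fingerprint - extend the table");
    }

Failing loudly on an unknown fingerprint is the design point: a wrong pre-tokenizer silently produces subtly wrong tokenizations, so an error at conversion time is preferable.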
diff --git a/llama.cpp b/llama.cpp
index 09d8a0dd8..e05d10cdb 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -316,6 +316,7 @@ enum llm_kv {
     LLM_KV_SSM_TIME_STEP_RANK,
 
     LLM_KV_TOKENIZER_MODEL,
+    LLM_KV_TOKENIZER_PRE,
     LLM_KV_TOKENIZER_LIST,
     LLM_KV_TOKENIZER_TOKEN_TYPE,
     LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
@@ -392,6 +393,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_SSM_TIME_STEP_RANK,            "%s.ssm.time_step_rank" },
 
     { LLM_KV_TOKENIZER_MODEL,               "tokenizer.ggml.model" },
+    { LLM_KV_TOKENIZER_PRE,                 "tokenizer.ggml.pre" },
     { LLM_KV_TOKENIZER_LIST,                "tokenizer.ggml.tokens" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE,          "tokenizer.ggml.token_type" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,    "tokenizer.ggml.token_type_count" },
@@ -2114,8 +2116,8 @@ struct llama_vocab {
         ttype type;
     };
 
-    enum llm_arch             arch = LLM_ARCH_UNKNOWN;
-    enum llama_vocab_type     type = LLAMA_VOCAB_TYPE_SPM;
+    enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
+    enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 
     std::unordered_map<token, id> token_to_id;
     std::vector<token_data>       id_to_token;
@@ -4166,11 +4168,13 @@ static void llm_load_vocab(
 
     // determine vocab type
     {
-        std::string tokenizer_name;
+        std::string tokenizer_model;
+        std::string tokenizer_pre;
 
-        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
+        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
+        ml.get_key(LLM_KV_TOKENIZER_PRE,   tokenizer_pre, false);
 
-        if (tokenizer_name == "no_vocab") {
+        if (tokenizer_model == "no_vocab") {
             vocab.type = LLAMA_VOCAB_TYPE_NONE;
 
             // default special tokens
@@ -4184,7 +4188,7 @@ static void llm_load_vocab(
             vocab.linefeed_id = -1;
 
             return;
-        } else if (tokenizer_name == "llama") {
+        } else if (tokenizer_model == "llama") {
             vocab.type = LLAMA_VOCAB_TYPE_SPM;
 
             // default special tokens
@@ -4229,7 +4233,7 @@ static void llm_load_vocab(
             if (add_space_prefix_keyidx != -1) {
                 vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
             } // The default value of add_space_prefix is true.
-        } else if (tokenizer_name == "bert") {
+        } else if (tokenizer_model == "bert") {
            vocab.type = LLAMA_VOCAB_TYPE_WPM;
 
             // default special tokens
@@ -4242,10 +4246,10 @@ static void llm_load_vocab(
             vocab.special_mask_id   = 103;
             vocab.add_space_prefix  = false;
         } else {
-            if (tokenizer_name == "gpt2") {
+            if (tokenizer_model == "gpt2") {
                 vocab.type = LLAMA_VOCAB_TYPE_BPE;
             } else {
-                LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
+                LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
                 LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
                 vocab.type = LLAMA_VOCAB_TYPE_SPM;
                 return;
@@ -4285,7 +4289,20 @@ static void llm_load_vocab(
             vocab.special_mask_id = -1;
         }
 
-        vocab.arch = model.arch;
+        if (tokenizer_pre.empty()) {
+            LLAMA_LOG_WARN("%s: missing tokenizer pre, using default tokenizer pre: 'default'", __func__);
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+        } else if (tokenizer_pre == "default") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+        } else if (tokenizer_pre == "llama3") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+        } else if (tokenizer_pre == "deepseek-llm") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
+        } else if (tokenizer_pre == "deepseek-coder") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
+        } else {
+            throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+        }
     }
 
     const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
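With the mapping above, the chosen pre-tokenizer travels inside the GGUF file under the tokenizer.ggml.pre key, and an unknown value now fails loudly at load time instead of silently mis-tokenizing. Below is a minimal sketch of inspecting that key through the public gguf API from ggml.h; the model path is just an example and error handling is kept minimal.

    // Sketch: read the new "tokenizer.ggml.pre" KV pair from a GGUF file.
    // Uses the gguf API from ggml.h; the model path is a placeholder.
    #include "ggml.h"

    #include <cstdio>

    int main() {
        struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
        struct gguf_context * ctx = gguf_init_from_file("models/ggml-vocab-llama.gguf", params);
        if (ctx == NULL) {
            fprintf(stderr, "failed to load gguf file\n");
            return 1;
        }

        const int key_id = gguf_find_key(ctx, "tokenizer.ggml.pre");
        if (key_id == -1) {
            // key absent in older files: the loader warns and assumes LLAMA_VOCAB_PRE_TYPE_DEFAULT
            printf("tokenizer.ggml.pre: (missing, 'default' assumed)\n");
        } else {
            printf("tokenizer.ggml.pre: %s\n", gguf_get_val_str(ctx, key_id));
        }

        gguf_free(ctx);
        return 0;
    }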
+ //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+ + "\\p{P}+", + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + "\\p{N}+", + "[0-9][0-9][0-9]" + }); + break; + case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: + word_collection = unicode_regex_split(text, { + "[\r\n]", + "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+", + "\\s?[!-/:-~!-/:-~‘-‟ -。]+", + "\\s+$", + "[一-龥ࠀ-一가-퟿]+", + "\\p{N}+" + }); + break; + case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: + word_collection = unicode_regex_split(text, { + "[\r\n]", + "\\s?\\p{L}+", + "\\s?\\p{P}+", + "[一-龥ࠀ-一가-퟿]+", + "\\p{N}+" + }); + break; default: // default regex for BPE tokenization pre-processing - { - word_collection = unicode_regex_split(text, { - "\\p{P}+", - "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", - "\\p{N}+", - "[0-9][0-9][0-9]" - }); - } + word_collection = unicode_regex_split(text, { + "\\p{P}+", + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + "\\p{N}+", + "[0-9][0-9][0-9]" + }); break; } break; diff --git a/llama.h b/llama.h index 8aa763672..9c89d72af 100644 --- a/llama.h +++ b/llama.h @@ -69,6 +69,14 @@ extern "C" { LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece }; + // pre-tokenization types + enum llama_vocab_pre_type { + LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0, + LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1, + LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2, + LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3, + }; + // note: these values should be synchronized with ggml_rope // TODO: maybe move this enum to ggml.h (ggml_rope_type) enum llama_rope_type { diff --git a/models/ggml-vocab-deepseek-coder.gguf b/models/ggml-vocab-deepseek-coder.gguf index 640ee63d8742a9db73cd66673e74c7a6978870e9..8ea17fa4d988de61d2db3941d3a4870fe5f3303a 100644 GIT binary patch delta 220 zcmbPo(e3AXH$ivzP&Z}<1dyC4n9OXZXFjo9ol#-pd@n|UiLcaI8MGM~q$W>f6rcQ` zk&jgYs9YAx;M1& delta 232 zcmex)-fhZ7H$ivzP&Z}<1dy31n9OXbXFRc7ol#=qd@n|hiLcaI8$=lxq$W>f6rZfi z$j2%GR4xnUadE_Fm$3i^1SV%P)-l2rF$pkCW@XZ0gsbC*sROE=Jb}?{at@=)WJ9JZ zF+Ql}DXFOi#i^;;@yYoqsYT7(ncBBAF#<7&W(HywAZ7((HXvpPVh$kY1Y#~A<_2OO VAm#;PJ|N}?Vu9`3nFO7d0swdFJ1YPH diff --git a/models/ggml-vocab-deepseek-llm.gguf b/models/ggml-vocab-deepseek-llm.gguf index 8fed82fa0ba1b9fc1fad26392d56831ffaf21817..1e087220f2a7150c913e5b58e1cd28fd8a6e31cb 100644 GIT binary patch delta 933 zcmaLU&2G~`5C`yVTA&RSnvyn93M{2zf#4Jhv``v}Ug(8uPN=dr_BdE|Y|FbKg{q1W z2d+d$;uSb>=LQG3@f5s5#RKs7M~J8hEB)k;of(g3zkfVr-##6(^3G1{$wXnas)O>= zrAq#66NQRs&+xpivC%fH=TZ+|zh{f_S5}Q%Y%U%!BmTu!;xBCdaXHWGkWZ2H8jkCc zse2(!VuQhK9^iShD;qBZOSVnrzDoBC?^x`xI;(u}b3dw^&6}B z)+#bct`jf`$KW`efGL=U5}brnFaxLI49r3q=HM(;U>+8r3g_TFEW!m?f{SnoE(3=v zunbpW1!`~&1YCz3a1-iq3k+z$D%^%Uum*SG9^8j@*nkJ{5H{h_XREmNbI`A5V<_lg zk9>?PY9!}iYU+lb)HH?dAIW&J_G{dAxF2XPZQINaLc4))hEdQBHFp)a$#H$MdF=qf zipE7Yxg$MA2eIR2*sni$Qm&hAaPAcUpEs>Ge+#bI8PhtA|H^vrM my`I9P^U{C*=17mj7KdX>CzgyRoih6V_i>~H6{B{Kl=%&%$rjuI delta 428 zcmYMs%`O9B6vlCHt6KU&X$|^8sj8x8#zz$^3*B^UVKteUS2fv-Y7J38xB~6^bQmF&gq`SftV>NXn9o5rPmAuab# zwoSvRnib2`b^WAw|5Qv3b4uAQyWDl9)wVmfQc_tR={B9Nt%5{^7ZoM%N0dayi;0|f zF2bU~qp3DX68)DzPD|f9QiV8(4F5V(wp}T&Eu~uhXA%#}_o#dVf8CE!1Tcmm zLKw#c!iXS>7$z}=IHoa!1d^DAhB?e*0V$-BK^BWxLJrGVK_06pU=2mAql67?Vhd$# aLx%wqJJ`h@_EEtBsyM{ahw>eN_kIC?pmE6n diff --git 
diff --git a/models/ggml-vocab-deepseek-coder.gguf b/models/ggml-vocab-deepseek-coder.gguf
index 640ee63d8742a9db73cd66673e74c7a6978870e9..8ea17fa4d988de61d2db3941d3a4870fe5f3303a 100644
GIT binary patch
delta 220
zcmbPo(e3AXH$ivzP&Z}<1dyC4n9OXZXFjo9ol#-pd@n|UiLcaI8MGM~q$W>f6rcQ`
zk&jgYs9YAx;M1&

delta 232
zcmex)-fhZ7H$ivzP&Z}<1dy31n9OXbXFRc7ol#=qd@n|hiLcaI8$=lxq$W>f6rZfi
$j2%GR4xnUadE_Fm$3i^1SV%P)-l2rF$pkCW@XZ0gsbC*sROE=Jb}?{at@=)WJ9JZ
F+Ql}DXFOi#i^;;@yYoqsYT7(ncBBAF#<7&W(HywAZ7((HXvpPVh$kY1Y#~A<_2OO
VAm#;PJ|N}?Vu9`3nFO7d0swdFJ1YPH

diff --git a/models/ggml-vocab-deepseek-llm.gguf b/models/ggml-vocab-deepseek-llm.gguf
index 8fed82fa0ba1b9fc1fad26392d56831ffaf21817..1e087220f2a7150c913e5b58e1cd28fd8a6e31cb 100644
GIT binary patch
delta 933
zcmaLU&2G~`5C`yVTA&RSnvyn93M{2zf#4Jhv``v}Ug(8uPN=dr_BdE|Y|FbKg{q1W
2d+d$;uSb>=LQG3@f5s5#RKs7M~J8hEB)k;of(g3zkfVr-##6(^3G1{$wXnas)O>=
rAq#66NQRs&+xpivC%fH=TZ+|zh{f_S5}Q%Y%U%!BmTu!;xBCdaXHWGkWZ2H8jkCc
se2(!VuQhK9^iShD;qBZOSVnrzDoBC?^x`xI;(u}b3dw^&6}B
)+#bct`jf`$KW`efGL=U5}brnFaxLI49r3q=HM(;U>+8r3g_TFEW!m?f{SnoE(3=v
unbpW1!`~&1YCz3a1-iq3k+z$D%^%Uum*SG9^8j@*nkJ{5H{h_XREmNbI`A5V<_lg
k9>?PY9!}iYU+lb)HH?dAIW&J_G{dAxF2XPZQINaLc4))hEdQBHFp)a$#H$MdF=qf
ipE7Yxg$MA2eIR2*sni$Qm&hAaPAcUpEs>Ge+#bI8PhtA|H^vrM
my`I9P^U{C*=17mj7KdX>CzgyRoih6V_i>~H6{B{Kl=%&%$rjuI

delta 428
zcmYMs%`O9B6vlCHt6KU&X$|^8sj8x8#zz$^3*B^UVKteUS2fv-Y7J38xB~6^bQmF&gq`SftV>NXn9o5rPmAuab#
woSvRnib2`b^WAw|5Qv3b4uAQyWDl9)wVmfQc_tR={B9Nt%5{^7ZoM%N0dayi;0|f
F2bU~qp3DX68)DzPD|f9QiV8(4F5V(wp}T&Eu~uhXA%#}_o#dVf8CE!1Tcmm
LKw#c!iXS>7$z}=IHoa!1d^DAhB?e*0V$-BK^BWxLJrGVK_06pU=2mAql67?Vhd$#
aLx%wqJJ`h@_EEtBsyM{ahw>eN_kIC?pmE6n

diff --git a/models/ggml-vocab-llama.gguf b/models/ggml-vocab-llama.gguf
index 549eed8c53f438a61f1b00c9bd3b7d02325f2479..568ffdc16fe655ed515045177de6687bb613e5df 100644
GIT binary patch
delta 916
zcmcaJRp;m&9YJ^ZP&Z}<1dyI6=%XkJgbHjso$iRa`dMl*Xs5C;g-Q}a@b5_9y@GILVnODYReK_)Q*u>eRG
2p}ev`I^kkES)+1c$Sh6ML?0VQ@mCV|@Ric0K)EbWGB
?As01IKIAj)X2^|hYM$O$K54_CBd;d-ByfKh*6+DK#X&HfEXA1m+ARZoDS2&xwu;y
Wxy(g6H`*+lk$t>OY*Z*^FR@RERl*V!5|0LApuN2X_*xe4e^;Npp?j7%K(!tE=@z1
h<$%kI=4Pg(fKo8XGN@k8+UX5k+|oiND56|IJpdPZ=LY})

delta 221
zcmX>)N9WE|9YJ^ZP&Z}<1Q47k=p(=Z<@@*~`Z_9<8BLt8$S5%JpenNf1H;5RStbRB
iO)3{wI?$%=}Z=4RAW^Ds+XPI$0$Boi;-(`93wjmNMv#^qZy;f^ohLOVv}8^(
%=t`wj10}|nA+DdF#<7&W(HywAZ7((w(aYf*x$)RRr41o=NF|Ga{`TH17e1DMiut$
kj4B-LUl=99#!Zix;1ptHXfKlB++HNX#r|dbP9E-70I8@l!T>

diff --git a/tests/test-tokenizer-0-falcon.cpp b/tests/test-tokenizer-0-falcon.cpp
--- a/tests/test-tokenizer-0-falcon.cpp
+++ b/tests/test-tokenizer-0-falcon.cpp
@@ -17,6 +17,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
         { " "           , {    466, }, },
         { "\t"          , {    192, }, },
         { "\n"          , {    193, }, },
+        { "\n\n"        , {   1001, }, },
+        { "\n\n\n"      , {  11331, }, },
         { "\t\n"        , {  19125, }, },
         { "Hello world" , {   9856,  1079, }, },
         { " Hello world", {  23090,  1079, }, },
diff --git a/tests/test-tokenizer-0-llama-v3.cpp b/tests/test-tokenizer-0-llama-v3.cpp
index a0ecf6283..2e91b717f 100644
--- a/tests/test-tokenizer-0-llama-v3.cpp
+++ b/tests/test-tokenizer-0-llama-v3.cpp
@@ -17,6 +17,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
         { " "           , {    262, }, },
         { "\t"          , {    197, }, },
         { "\n"          , {    198, }, },
+        { "\n\n"        , {    271, }, },
+        { "\n\n\n"      , {   1432, }, },
         { "\t\n"        , {   1602, }, },
         { "Hello world" , {   9906,  1917, }, },
         { " Hello world", {  22691,  1917, }, },
diff --git a/tests/test-tokenizer-0-llama.cpp b/tests/test-tokenizer-0-llama.cpp
index fd407041b..f0634cfe5 100644
--- a/tests/test-tokenizer-0-llama.cpp
+++ b/tests/test-tokenizer-0-llama.cpp
@@ -17,6 +17,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
         { " "           , {    268, }, },
         { "\t"          , {  29871,     12, }, },
         { "\n"          , {  29871,     13, }, },
+        { "\n\n"        , {  29871,     13,     13, }, },
+        { "\n\n\n"      , {  29871,     13,     13,     13, }, },
         { "\t\n"        , {  29871,     12,     13, }, },
         { "Hello world" , {  15043,   3186, }, },
         { " Hello world", {  29871,  15043,   3186, }, },
diff --git a/tests/test-tokenizer-0-spm.py b/tests/test-tokenizer-0-spm.py
index f2d3b6e88..be12a6b93 100644
--- a/tests/test-tokenizer-0-spm.py
+++ b/tests/test-tokenizer-0-spm.py
@@ -27,6 +27,8 @@ tests = [
     " ",
     "\t",
     "\n",
+    "\n\n",
+    "\n\n\n",
     "\t\n",
     "Hello world",
     " Hello world",
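The new "\n\n" and "\n\n\n" test cases pin down exactly where the pre-tokenizers diverge: the BPE llama-v3 vocab encodes "\n\n" as the single token 271, while the SPM llama vocab yields 29871, 13, 13. A compact sketch of the kind of check these test programs perform follows; here tokenize is a placeholder for the real llama.cpp tokenization call, and only the newline cases from the hunks above are shown.

    // Shape of the regression check performed by the test programs above;
    // `tokenize` is a placeholder for the real llama.cpp tokenization call.
    #include <cstdint>
    #include <functional>
    #include <map>
    #include <string>
    #include <vector>

    using llama_token = std::int32_t;

    // Expectations lifted from the hunks above: the BPE llama-v3 vocab merges
    // consecutive newlines into one token, the SPM llama vocab does not.
    static const std::map<std::string, std::vector<llama_token>> k_tests_llama_v3 = {
        { "\n\n"  , {   271, }, },
        { "\n\n\n", {  1432, }, },
    };

    static const std::map<std::string, std::vector<llama_token>> k_tests_llama_spm = {
        { "\n\n"  , { 29871, 13, 13, }, },
        { "\n\n\n", { 29871, 13, 13, 13, }, },
    };

    static bool run_k_tests(
            const std::map<std::string, std::vector<llama_token>> & k_tests,
            const std::function<std::vector<llama_token>(const std::string &)> & tokenize) {
        bool ok = true;
        for (const auto & it : k_tests) {
            ok = ok && (tokenize(it.first) == it.second);
        }
        return ok;
    }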