llama : adapt punctuation regex + add llama 3 regex
parent 96965f67e6
commit ad929833cb

2 changed files with 14 additions and 10 deletions

llama.cpp (13 changes)
@@ -12031,15 +12031,18 @@ struct llm_tokenizer_bpe {
        switch (vocab.type_pre) {
            case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
                word_collection = unicode_regex_split(text, {
                    // TODO: ??????????????
                    // original regex from tokenizer.json
                    //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",

                    // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"

                    // TODO: this is not the same as the original regex:
                    // - need to use ReFlex and update unicode.cpp to support the regex above
                    // - or implement a custom function similar to unicode_gpt2_regex_preprocess()
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                    "\\p{N}+",
                    "[0-9][0-9][0-9]"
                    //"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                    //"\\p{N}+",
                    //"[0-9][0-9][0-9]"
                });
                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
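The original tokenizer.json pattern relies on an inline case-insensitive group (?i:'s|'t|'re|'ve|'m|'ll|'d); the adapted pattern taken from the PR discussion spells the contractions out with explicit character classes instead, and the accompanying TODO notes that supporting it still requires ReFlex work in unicode.cpp or a custom function along the lines of unicode_gpt2_regex_preprocess(). Below is a minimal standalone sketch (plain std::regex on ASCII text, not llama.cpp's unicode_regex_split) showing that the expanded alternation matches the same mixed-case contraction suffixes:

```cpp
// Standalone illustration only, not llama.cpp code: the contraction part of
// the adapted LLAMA3 regex, with (?i:...) expanded into explicit [sS]/[tT]/...
// classes so that no inline case-insensitive group is needed.
#include <iostream>
#include <regex>
#include <string>

int main() {
    const std::regex contractions("'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD]");
    const std::string text = "It'S here, they'RE gone, we'll see, I'd say";

    for (auto it = std::sregex_iterator(text.begin(), text.end(), contractions);
         it != std::sregex_iterator(); ++it) {
        std::cout << "match: " << it->str() << "\n";   // 'S, 'RE, 'll, 'd
    }
    return 0;
}
```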
@@ -12064,7 +12067,7 @@ struct llm_tokenizer_bpe {
            default:
                // default regex for BPE tokenization pre-processing
                word_collection = unicode_regex_split(text, {
                    "\\p{P}+",
                    "[\\p{P}\\$\\+<=>\\^~\\|]+",
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                    "\\p{N}+",
                    "[0-9][0-9][0-9]"
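This hunk widens the default pre-processing class from \p{P}+ to [\p{P}$+<=>^~|]+: characters such as $, +, <, =, >, ^, ~ and | are Unicode symbols (category S) rather than punctuation, so a punctuation-only class would not capture them. A rough standalone illustration of the difference, approximating \p{P} with a literal ASCII set since std::regex does not understand \p{...} classes:

```cpp
// Rough standalone illustration (not llama.cpp code): contrast the old
// punctuation-only class with the adapted class that also covers the
// ASCII symbol characters $ + < = > ^ ~ |.
#include <iostream>
#include <regex>
#include <string>

static void print_matches(const char * label, const std::string & text, const std::regex & re) {
    std::cout << label << ":";
    for (auto it = std::sregex_iterator(text.begin(), text.end(), re);
         it != std::sregex_iterator(); ++it) {
        std::cout << " '" << it->str() << "'";
    }
    std::cout << "\n";
}

int main() {
    const std::string text = "price=$5+tax, ok?";

    // ASCII stand-in for \p{P}+ (no $, +, <, =, >, ^, ~, |)
    const std::regex punct_only(R"([!"#%&'()*,\-./:;?@\[\\\]_{}]+)");
    // ASCII stand-in for the adapted class [\p{P}$+<=>^~|]+
    const std::regex punct_and_symbols(R"([!"#%&'()*,\-./:;?@\[\\\]_{}$+<=>^~|]+)");

    print_matches("old", text, punct_only);        // old: ',' '?'
    print_matches("new", text, punct_and_symbols); // new: '=$' '+' ',' '?'
    return 0;
}
```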
File diff suppressed because one or more lines are too long