llama : adapt punctuation regex + add llama 3 regex

This commit is contained in:
Georgi Gerganov 2024-04-27 11:06:08 +03:00
parent 96965f67e6
commit ad929833cb
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
2 changed files with 14 additions and 10 deletions

View file

@@ -12031,15 +12031,18 @@ struct llm_tokenizer_bpe {
         switch (vocab.type_pre) {
             case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
                 word_collection = unicode_regex_split(text, {
-                    // TODO: ??????????????
+                    // original regex from tokenizer.json
                     //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                    // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
                     // TODO: this is not the same as the original regex:
                     // - need to use ReFlex and update unicode.cpp to support the regex above
                     // - or implement a custom function similar to unicode_gpt2_regex_preprocess()
-                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-                    "\\p{N}+",
-                    "[0-9][0-9][0-9]"
+                    //"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                    //"\\p{N}+",
+                    //"[0-9][0-9][0-9]"
                 });
                 break;
             case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
@@ -12064,7 +12067,7 @@ struct llm_tokenizer_bpe {
             default:
                 // default regex for BPE tokenization pre-processing
                 word_collection = unicode_regex_split(text, {
-                    "\\p{P}+",
+                    "[\\p{P}\\$\\+<=>\\^~\\|]+",
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                     "\\p{N}+",
                     "[0-9][0-9][0-9]"

File diff suppressed because one or more lines are too long