llama : adapt punctuation regex + add llama 3 regex
This commit is contained in:
parent
96965f67e6
commit
ad929833cb
2 changed files with 14 additions and 10 deletions
13
llama.cpp
13
llama.cpp
|
@ -12031,15 +12031,18 @@ struct llm_tokenizer_bpe {
|
||||||
switch (vocab.type_pre) {
|
switch (vocab.type_pre) {
|
||||||
case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
|
case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
|
||||||
word_collection = unicode_regex_split(text, {
|
word_collection = unicode_regex_split(text, {
|
||||||
// TODO: ??????????????
|
// original regex from tokenizer.json
|
||||||
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||||
|
|
||||||
|
// adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
|
||||||
|
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
|
||||||
|
|
||||||
// TODO: this is not the same as the original regex:
|
// TODO: this is not the same as the original regex:
|
||||||
// - need to use ReFlex and update unicode.cpp to support the regex above
|
// - need to use ReFlex and update unicode.cpp to support the regex above
|
||||||
// - or implement a custom function similar to unicode_gpt2_regex_preprocess()
|
// - or implement a custom function similar to unicode_gpt2_regex_preprocess()
|
||||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
//"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||||
"\\p{N}+",
|
//"\\p{N}+",
|
||||||
"[0-9][0-9][0-9]"
|
//"[0-9][0-9][0-9]"
|
||||||
});
|
});
|
||||||
break;
|
break;
|
||||||
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
|
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
|
||||||
|
@ -12064,7 +12067,7 @@ struct llm_tokenizer_bpe {
|
||||||
default:
|
default:
|
||||||
// default regex for BPE tokenization pre-processing
|
// default regex for BPE tokenization pre-processing
|
||||||
word_collection = unicode_regex_split(text, {
|
word_collection = unicode_regex_split(text, {
|
||||||
"\\p{P}+",
|
"[\\p{P}\\$\\+<=>\\^~\\|]+",
|
||||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||||
"\\p{N}+",
|
"\\p{N}+",
|
||||||
"[0-9][0-9][0-9]"
|
"[0-9][0-9][0-9]"
|
||||||
|
|
File diff suppressed because one or more lines are too long
Loading…
Add table
Add a link
Reference in a new issue