llama : adapt punctuation regex + add llama 3 regex
parent 96965f67e6
commit ad929833cb

2 changed files with 14 additions and 10 deletions

llama.cpp (13 changes)
@@ -12031,15 +12031,18 @@ struct llm_tokenizer_bpe {
        switch (vocab.type_pre) {
            case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
                word_collection = unicode_regex_split(text, {
                    // TODO: ??????????????
                    // original regex from tokenizer.json
                    //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",

                    // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"

                    // TODO: this is not the same as the original regex:
                    // - need to use ReFlex and update unicode.cpp to support the regex above
                    // - or implement a custom function similar to unicode_gpt2_regex_preprocess()
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                    "\\p{N}+",
                    "[0-9][0-9][0-9]"
                    //"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                    //"\\p{N}+",
                    //"[0-9][0-9][0-9]"
                });
                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
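The original tokenizer.json pattern relies on an inline case-insensitive group (?i:'s|'t|'re|'ve|'m|'ll|'d); the adapted pattern taken from the PR discussion spells the contractions out with explicit character classes instead, and the accompanying TODO notes that supporting it still requires ReFlex work in unicode.cpp or a custom function along the lines of unicode_gpt2_regex_preprocess(). Below is a minimal standalone sketch (plain std::regex on ASCII text, not llama.cpp's unicode_regex_split) showing that the expanded alternation matches the same mixed-case contraction suffixes:

```cpp
// Standalone illustration only, not llama.cpp code: the contraction part of
// the adapted LLAMA3 regex, with (?i:...) expanded into explicit [sS]/[tT]/...
// classes so that no inline case-insensitive group is needed.
#include <iostream>
#include <regex>
#include <string>

int main() {
    const std::regex contractions("'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD]");
    const std::string text = "It'S here, they'RE gone, we'll see, I'd say";

    for (auto it = std::sregex_iterator(text.begin(), text.end(), contractions);
         it != std::sregex_iterator(); ++it) {
        std::cout << "match: " << it->str() << "\n";   // 'S, 'RE, 'll, 'd
    }
    return 0;
}
```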
@@ -12064,7 +12067,7 @@ struct llm_tokenizer_bpe {
            default:
                // default regex for BPE tokenization pre-processing
                word_collection = unicode_regex_split(text, {
                    "\\p{P}+",
                    "[\\p{P}\\$\\+<=>\\^~\\|]+",
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                    "\\p{N}+",
                    "[0-9][0-9][0-9]"
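This hunk widens the default pre-processing class from \p{P}+ to [\p{P}$+<=>^~|]+: characters such as $, +, <, =, >, ^, ~ and | are Unicode symbols (category S) rather than punctuation, so a punctuation-only class would not capture them. A rough standalone illustration of the difference, approximating \p{P} with a literal ASCII set since std::regex does not understand \p{...} classes:

```cpp
// Rough standalone illustration (not llama.cpp code): contrast the old
// punctuation-only class with the adapted class that also covers the
// ASCII symbol characters $ + < = > ^ ~ |.
#include <iostream>
#include <regex>
#include <string>

static void print_matches(const char * label, const std::string & text, const std::regex & re) {
    std::cout << label << ":";
    for (auto it = std::sregex_iterator(text.begin(), text.end(), re);
         it != std::sregex_iterator(); ++it) {
        std::cout << " '" << it->str() << "'";
    }
    std::cout << "\n";
}

int main() {
    const std::string text = "price=$5+tax, ok?";

    // ASCII stand-in for \p{P}+ (no $, +, <, =, >, ^, ~, |)
    const std::regex punct_only(R"([!"#%&'()*,\-./:;?@\[\\\]_{}]+)");
    // ASCII stand-in for the adapted class [\p{P}$+<=>^~|]+
    const std::regex punct_and_symbols(R"([!"#%&'()*,\-./:;?@\[\\\]_{}$+<=>^~|]+)");

    print_matches("old", text, punct_only);        // old: ',' '?'
    print_matches("new", text, punct_and_symbols); // new: '=$' '+' ',' '?'
    return 0;
}
```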
File diff suppressed because one or more lines are too long