Merge branch 'feat-jina-embeddings-v2-zh' of https://github.com/JoanFM/llama.cpp into feat-jina-embeddings-v2-zh
This commit is contained in:
commit
0699a4ce1d
1 changed file with 0 additions and 9 deletions
|
@@ -15468,15 +15468,6 @@ struct llm_tokenizer_bpe {
|
||||||
case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
|
case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
|
||||||
regex_exprs = {"\\w+|[^\\w\\s]+"};
|
regex_exprs = {"\\w+|[^\\w\\s]+"};
|
||||||
break;
|
break;
|
||||||
default:
|
|
||||||
// default regex for BPE tokenization pre-processing
|
|
||||||
regex_exprs = {
|
|
||||||
"[\\p{P}\\$\\+<=>\\^~\\|]+",
|
|
||||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
|
||||||
"\\p{N}+",
|
|
||||||
"[0-9][0-9][0-9]",
|
|
||||||
};
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue