Merge branch 'feat-jina-embeddings-v2-zh' of https://github.com/JoanFM/llama.cpp into feat-jina-embeddings-v2-zh

This commit is contained in:
Joan Martinez 2024-07-08 14:11:02 +02:00
commit 0699a4ce1d

View file

@@ -15468,15 +15468,6 @@ struct llm_tokenizer_bpe {
case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
regex_exprs = {"\\w+|[^\\w\\s]+"};
break;
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {
"[\\p{P}\\$\\+<=>\\^~\\|]+",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
"\\p{N}+",
"[0-9][0-9][0-9]",
};
break;
}
}