Fix falcon punctuation regex

This commit is contained in:
jaime-m-p 2024-05-25 04:32:45 +02:00
parent 0794b77714
commit 51e933a962

View file

@ -12315,7 +12315,7 @@ struct llm_tokenizer_bpe {
break;
case LLAMA_VOCAB_PRE_TYPE_FALCON:
regex_exprs = {
"[\\p{P}\\$\\+<=>\\^~\\|]+",
"[\\p{P}\\$\\+<=>\\^~\\|`]+",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
"[0-9][0-9][0-9]",
};