Fix falcon punctuation regex

This commit is contained in:
jaime-m-p 2024-05-25 04:32:45 +02:00
parent 0794b77714
commit 51e933a962

View file

@ -12315,7 +12315,7 @@ struct llm_tokenizer_bpe {
break; break;
case LLAMA_VOCAB_PRE_TYPE_FALCON: case LLAMA_VOCAB_PRE_TYPE_FALCON:
regex_exprs = { regex_exprs = {
"[\\p{P}\\$\\+<=>\\^~\\|]+", "[\\p{P}\\$\\+<=>\\^~\\|`]+",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
"[0-9][0-9][0-9]", "[0-9][0-9][0-9]",
}; };