falcon : fix regex

This commit is contained in:
Georgi Gerganov 2024-05-02 11:52:50 +03:00
parent 3a461dbff3
commit 3275e60f57
No known key found for this signature in database
GPG key ID: BF970631944C16B7

View file

@ -12212,14 +12212,13 @@ struct llm_tokenizer_bpe {
"\\s?\\p{L}+", "\\s?\\p{L}+",
"\\s?\\p{P}+", "\\s?\\p{P}+",
"[一-龥ࠀ-一가-퟿]+", "[一-龥ࠀ-一가-퟿]+",
"\\p{N}+", "\\p{N}",
}); });
break; break;
case LLAMA_VOCAB_PRE_TYPE_FALCON: case LLAMA_VOCAB_PRE_TYPE_FALCON:
word_collection = unicode_regex_split(text, { word_collection = unicode_regex_split(text, {
"[\\p{P}\\$\\+<=>\\^~\\|]+", "[\\p{P}\\$\\+<=>\\^~\\|]+",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
"\\p{N}+",
"[0-9][0-9][0-9]", "[0-9][0-9][0-9]",
}); });
break; break;