fix punctuation regex in chameleon pre-tokenizer (@compilade)

Co-authored-by: compilade <git@compilade.net>
This commit is contained in:
nopperl 2024-07-22 11:47:35 +00:00 committed by GitHub
parent 1e1e78a324
commit 0ee896d149
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@@ -15852,7 +15852,7 @@ struct llm_tokenizer_bpe {
"(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z", // Image tokens "(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z", // Image tokens
"([\\t\\n]| | )", // directly from tokenizer.json "([\\t\\n]| | )", // directly from tokenizer.json
"\\p{N}", // Individual digits "\\p{N}", // Individual digits
"[\\p{P}\\$\\+<=>\\^~\\|`]+", // Punctuation "[\\p{P}!-/:-@\\[-`{-~]", // Punctuation, Isolated
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
}; };
break; break;