Add StableLM2 pre-tokenizer (#7349)

* Add StableLM pre-tokenizer

* Fix space

* Fix trailing whitespace
This commit is contained in:
Anas Ahouzi 2024-05-19 14:46:46 +02:00 committed by GitHub
parent ab33f7a338
commit 6aade19ee7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 12 additions and 3 deletions

View file

@ -4463,6 +4463,9 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "qwen2") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
} else if (
tokenizer_pre == "stablelm2") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
} else if (
tokenizer_pre == "olmo") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
@ -12363,6 +12366,7 @@ struct llm_tokenizer_bpe {
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
});
break;
case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
case LLAMA_VOCAB_PRE_TYPE_QWEN2:
word_collection = unicode_regex_split(text, {
// original regex from tokenizer.json