Add StableLM2 pre-tokenizer (#7349)

* Add StableLM pre-tokenizer
* Fix space
* Fix trailing whitespace
2024-05-19 14:46:46 +02:00 · 2024-05-19 14:46:46 +02:00 · 6aade19ee7
commit 6aade19ee7
parent ab33f7a338
4 changed files with 12 additions and 3 deletions
--- a/llama.cpp
+++ b/llama.cpp
@@ -4463,6 +4463,9 @@ static void llm_load_vocab(
            } else if (
                tokenizer_pre == "qwen2") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+            } else if (
+                tokenizer_pre == "stablelm2") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
            } else if (
                tokenizer_pre == "olmo") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
@@ -12363,6 +12366,7 @@ struct llm_tokenizer_bpe {
                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                        });
                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
                    case LLAMA_VOCAB_PRE_TYPE_QWEN2:
                        word_collection = unicode_regex_split(text, {
                            // original regex from tokenizer.json