llama : add support for SmolLm pre-tokenizer (#8609)

* Adding SmolLM Pre Tokenizer

* Update convert_hf_to_gguf_update.py

Co-authored-by: compilade <git@compilade.net>

* Update src/llama.cpp

Co-authored-by: compilade <git@compilade.net>

* handle regex

* removed .inp and out .out ggufs

---------

Co-authored-by: compilade <git@compilade.net>
This commit is contained in:
Jason Stillerman 2024-07-22 10:43:01 -04:00 committed by GitHub
parent 566daa5a5b
commit d94c6e0ccb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 10 additions and 0 deletions

View file

@ -5521,6 +5521,10 @@ static void llm_load_vocab(
vocab.tokenizer_clean_spaces = false;
vocab.tokenizer_ignore_merges = true;
vocab.tokenizer_add_bos = true;
} else if (
tokenizer_pre == "smollm") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
vocab.tokenizer_clean_spaces = false;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
@ -15543,6 +15547,7 @@ struct llm_tokenizer_bpe {
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
case LLAMA_VOCAB_PRE_TYPE_REFACT:
case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
regex_exprs = {
"\\p{N}",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",