llama : move codeshell after smollm to respect the enum order

hankeke303 2024-07-23 00:40:21 +08:00
parent f15ea2c928
commit a2ae810a38

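For context, llm_load_vocab maps the tokenizer_pre string onto a LLAMA_VOCAB_PRE_TYPE_* enum value through a long else-if chain, and this commit simply reorders two branches so that the chain follows the declaration order of the enum. A minimal, self-contained C++ sketch of that pattern is shown below; the trimmed-down enum and the pre_type_from_name helper are illustrative stand-ins, not the actual llama.cpp declarations:

    #include <stdexcept>
    #include <string>

    // Trimmed-down stand-in for the real llama_vocab_pre_type enum in llama.h;
    // only the entries relevant to this commit are shown.
    enum llama_vocab_pre_type_sketch {
        LLAMA_VOCAB_PRE_TYPE_JAIS,
        LLAMA_VOCAB_PRE_TYPE_TEKKEN,
        LLAMA_VOCAB_PRE_TYPE_SMOLLM,
        LLAMA_VOCAB_PRE_TYPE_CODESHELL, // declared after SMOLLM ...
    };

    // Illustrative helper (not a real llama.cpp function): the dispatch chain is
    // kept in the same order as the enum, so the "codeshell" branch now sits
    // after the "smollm" branch, mirroring the declaration order above.
    static llama_vocab_pre_type_sketch pre_type_from_name(const std::string & tokenizer_pre) {
        if (tokenizer_pre == "jais") {
            return LLAMA_VOCAB_PRE_TYPE_JAIS;
        } else if (tokenizer_pre == "tekken") {
            return LLAMA_VOCAB_PRE_TYPE_TEKKEN;
        } else if (tokenizer_pre == "smollm") {
            return LLAMA_VOCAB_PRE_TYPE_SMOLLM;
        } else if (tokenizer_pre == "codeshell") { // ... so it is matched after "smollm" here too
            return LLAMA_VOCAB_PRE_TYPE_CODESHELL;
        }
        throw std::runtime_error("unknown pre-tokenizer type: '" + tokenizer_pre + "'");
    }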

@@ -5515,9 +5515,6 @@ static void llm_load_vocab(
             } else if (
                     tokenizer_pre == "jais") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
-            } else if (
-                    tokenizer_pre == "codeshell") {
-                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
             } else if (
                     tokenizer_pre == "tekken") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
@@ -5528,6 +5525,9 @@ static void llm_load_vocab(
                     tokenizer_pre == "smollm") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
                 vocab.tokenizer_clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "codeshell") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -15550,8 +15550,8 @@ struct llm_tokenizer_bpe {
             case LLAMA_VOCAB_PRE_TYPE_STARCODER:
             case LLAMA_VOCAB_PRE_TYPE_REFACT:
             case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
-            case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
             case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
+            case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
                 regex_exprs = {
                     "\\p{N}",
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",