Use the LLAMA3 pre-tokenizer type for the falcon3 vocab

2024-12-10 17:24:57 +00:00 · 2024-12-10 17:24:57 +00:00 · 5d31c23f5e
commit 5d31c23f5e
parent bf01b18eaa
3 changed files with 1 addition and 11 deletions
--- a/include/llama.h
+++ b/include/llama.h
@@ -104,7 +104,6 @@ extern "C" {
        LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH   = 24,
        LLAMA_VOCAB_PRE_TYPE_EXAONE         = 25,
        LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
-        LLAMA_VOCAB_PRE_TYPE_FALCON3        = 27,
    };

    enum llama_rope_type {
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -412,15 +412,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                    "[0-9][0-9][0-9]",
                };
                break;
-            case LLAMA_VOCAB_PRE_TYPE_FALCON3:
-                regex_exprs = {
-                    // original regex from tokenizer.json
-                    //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-
-                    // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
-                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-                };
-                break;
            case LLAMA_VOCAB_PRE_TYPE_STARCODER:
            case LLAMA_VOCAB_PRE_TYPE_REFACT:
            case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6228,7 +6228,7 @@ static void llm_load_vocab(
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
            } else if (
                    tokenizer_pre == "falcon3") {
-                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON3;
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
                vocab.tokenizer_ignore_merges = true;
                vocab.tokenizer_add_bos = true;
            } else if (