From 5d31c23f5e0d39292456b19c54c50bc543e74615 Mon Sep 17 00:00:00 2001 From: Billel Mokeddem Date: Tue, 10 Dec 2024 17:24:57 +0000 Subject: [PATCH] Use llama vocab --- include/llama.h | 1 - src/llama-vocab.cpp | 9 --------- src/llama.cpp | 2 +- 3 files changed, 1 insertion(+), 11 deletions(-) diff --git a/include/llama.h b/include/llama.h index 72dfac906..ccb48f73c 100644 --- a/include/llama.h +++ b/include/llama.h @@ -104,7 +104,6 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24, LLAMA_VOCAB_PRE_TYPE_EXAONE = 25, LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26, - LLAMA_VOCAB_PRE_TYPE_FALCON3 = 27, }; enum llama_rope_type { diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index ea0d4adb9..d1dc96276 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -412,15 +412,6 @@ struct llm_tokenizer_bpe : llm_tokenizer { "[0-9][0-9][0-9]", }; break; - case LLAMA_VOCAB_PRE_TYPE_FALCON3: - regex_exprs = { - // original regex from tokenizer.json - //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", - - // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989 - "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", - }; - break; case LLAMA_VOCAB_PRE_TYPE_STARCODER: case LLAMA_VOCAB_PRE_TYPE_REFACT: case LLAMA_VOCAB_PRE_TYPE_COMMAND_R: diff --git a/src/llama.cpp b/src/llama.cpp index 67b1f3505..0e2b15cb6 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6228,7 +6228,7 @@ static void llm_load_vocab( vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON; } else if ( tokenizer_pre == "falcon3") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON3; + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3; vocab.tokenizer_ignore_merges = true; vocab.tokenizer_add_bos = true; } else if (