Use llama vocab

This commit is contained in:
Billel Mokeddem 2024-12-10 17:24:57 +00:00
parent bf01b18eaa
commit 5d31c23f5e
3 changed files with 1 additions and 11 deletions

View file

@ -104,7 +104,6 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
LLAMA_VOCAB_PRE_TYPE_FALCON3 = 27,
};
enum llama_rope_type {

View file

@ -412,15 +412,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"[0-9][0-9][0-9]",
};
break;
case LLAMA_VOCAB_PRE_TYPE_FALCON3:
regex_exprs = {
// original regex from tokenizer.json
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
// adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
case LLAMA_VOCAB_PRE_TYPE_REFACT:
case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:

View file

@ -6228,7 +6228,7 @@ static void llm_load_vocab(
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
} else if (
tokenizer_pre == "falcon3") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON3;
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
vocab.tokenizer_ignore_merges = true;
vocab.tokenizer_add_bos = true;
} else if (