Use llama vocab
parent bf01b18eaa
commit 5d31c23f5e
3 changed files with 1 addition and 11 deletions

Falcon3 uses the same BPE pre-tokenization as Llama 3, so the dedicated FALCON3 pre-type and its duplicated regexes are removed; the "falcon3" tokenizer_pre string now maps to LLAMA_VOCAB_PRE_TYPE_LLAMA3 instead.
@@ -104,7 +104,6 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
         LLAMA_VOCAB_PRE_TYPE_EXAONE       = 25,
         LLAMA_VOCAB_PRE_TYPE_CHAMELEON    = 26,
-        LLAMA_VOCAB_PRE_TYPE_FALCON3      = 27,
     };
 
     enum llama_rope_type {
@@ -412,15 +412,6 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "[0-9][0-9][0-9]",
                 };
                 break;
-            case LLAMA_VOCAB_PRE_TYPE_FALCON3:
-                regex_exprs = {
-                    // original regex from tokenizer.json
-                    //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-
-                    // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
-                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-                };
-                break;
             case LLAMA_VOCAB_PRE_TYPE_STARCODER:
             case LLAMA_VOCAB_PRE_TYPE_REFACT:
             case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
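The "adapted" expression removed here spells out case-insensitivity with per-letter character classes because the inline (?i:...) group from the model's tokenizer.json is not supported by every regex engine. Below is a minimal standalone sketch of that equivalence, using only the contraction alternation from the pattern above and std::regex rather than llama.cpp's own unicode splitter:

    #include <iostream>
    #include <regex>
    #include <string>

    int main() {
        // The expanded alternation from the diff: each letter is written as a
        // two-character class, so no inline (?i:...) flag is required.
        const std::regex contraction("(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])");

        for (const std::string s : {"it'S", "they'RE", "we'Ve", "I'd"}) {
            std::smatch m;
            if (std::regex_search(s, m, contraction)) {
                std::cout << s << " -> matched \"" << m.str() << "\"\n";
            }
        }
        return 0;
    }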
@@ -6228,7 +6228,7 @@ static void llm_load_vocab(
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
             } else if (
                     tokenizer_pre == "falcon3") {
-                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON3;
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
                 vocab.tokenizer_ignore_merges = true;
                 vocab.tokenizer_add_bos = true;
             } else if (
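Because this dispatch keys off the tokenizer.ggml.pre metadata string rather than the numeric enum value, GGUF files already converted with "falcon3" should keep loading; they simply resolve to the Llama 3 handler now. A condensed sketch of that dispatch shape, with hypothetical names, not the actual llm_load_vocab code:

    #include <iostream>
    #include <map>
    #include <string>

    // Hypothetical, boiled-down view of the string -> pre-type dispatch.
    enum pre_type { PRE_FALCON, PRE_LLAMA3, PRE_UNKNOWN };

    static pre_type resolve_pre(const std::string & tokenizer_pre) {
        static const std::map<std::string, pre_type> table = {
            { "falcon",  PRE_FALCON },
            { "falcon3", PRE_LLAMA3 },  // previously a dedicated PRE_FALCON3
        };
        const auto it = table.find(tokenizer_pre);
        return it != table.end() ? it->second : PRE_UNKNOWN;
    }

    int main() {
        std::cout << (resolve_pre("falcon3") == PRE_LLAMA3) << "\n";  // prints 1
        return 0;
    }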