Add Falcon3 support
parent 54ef9cfc72
commit bf01b18eaa

5 changed files with 22 additions and 0 deletions

@@ -525,6 +525,9 @@ class Model:
             else:
                 token: str = reverse_vocab[i]
                 if token in added_vocab:
+                    # We need to manually encode and decode the added tokens in case special characters
+                    # used for `\n` / `\t` have been manually added in the added tokens
+                    token = tokenizer.decode(tokenizer.encode(token))
                     if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
                         toktypes.append(gguf.TokenType.CONTROL)
                     else:

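The three added lines above normalize added tokens by round-tripping them through the tokenizer, so that escape sequences such as `\n` / `\t` typed literally into the added-tokens list end up stored as the real characters. A minimal sketch of that round trip, assuming the `transformers` package; the model id is taken from the references in this diff and the token is just an example:

from transformers import AutoTokenizer

# illustrative: any Hugging Face tokenizer with added tokens behaves the same way
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b")

token = ">>TITLE<<"  # an added token; it could also contain escaped `\n` / `\t`
# encode() maps the surface form to ids, decode() renders the canonical text,
# so manually escaped characters come back normalized
normalized = tokenizer.decode(tokenizer.encode(token))
print(repr(normalized))
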
@@ -571,6 +574,9 @@ class Model:
         if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
             # ref: https://huggingface.co/tiiuae/falcon-7b
             res = "falcon"
+        if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
+            # ref: https://huggingface.co/tiiuae/falcon3-7b
+            res = "falcon3"
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
             res = "bert-bge"

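The `chkhsh` values compared above are checksums of the tokenizer's behaviour on a fixed checking text: when a new model's hash is not recognized, a new entry like the Falcon3 one has to be added. A rough sketch of how such a hash can be computed; the probe string below is a short stand-in, not the actual checking text used by convert_hf_to_gguf.py:

from hashlib import sha256
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon3-7b")  # repo id from the diff

probe_text = "Hello, world! \n\t 1234567 ..."  # stand-in; the real checking text is much longer

# hash the token ids produced for the probe text; different pre-tokenizers
# split the text differently and therefore yield different hashes
chkhsh = sha256(str(tokenizer.encode(probe_text)).encode()).hexdigest()
print(chkhsh)
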
@@ -71,6 +71,7 @@ models = [
     {"name": "deepseek-llm",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
     {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
     {"name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
+    {"name": "falcon3",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon3-7b", },
     {"name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
     {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
     {"name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },

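Listing the repo here lets the update script fetch the Falcon3 tokenizer and regenerate the hash checks consumed by convert_hf_to_gguf.py. A simplified stand-in for that download step, assuming the `huggingface_hub` package:

from huggingface_hub import hf_hub_download

# repo id taken from the new list entry above; the real script iterates over
# every entry in `models` and fetches further tokenizer files as well
path = hf_hub_download(repo_id="tiiuae/falcon3-7b", filename="tokenizer.json")
print(path)  # local cache path of the downloaded tokenizer.json
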
@@ -104,6 +104,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH  = 24,
         LLAMA_VOCAB_PRE_TYPE_EXAONE        = 25,
         LLAMA_VOCAB_PRE_TYPE_CHAMELEON     = 26,
+        LLAMA_VOCAB_PRE_TYPE_FALCON3       = 27,
     };

     enum llama_rope_type {

@@ -412,6 +412,15 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "[0-9][0-9][0-9]",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_FALCON3:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+
+                    // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             case LLAMA_VOCAB_PRE_TYPE_STARCODER:
             case LLAMA_VOCAB_PRE_TYPE_REFACT:
             case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:

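The adapted pattern spells out the case-insensitive contractions with explicit character classes instead of relying on the inline `(?i:...)` group from tokenizer.json (the linked PR comment discusses the adaptation). A quick illustrative check in Python that the two contraction alternations accept the same strings; only the contraction part is compared, since the rest of the pattern relies on `\p{...}` classes, which would need the third-party `regex` module:

import re

original = re.compile(r"(?i:'s|'t|'re|'ve|'m|'ll|'d)")
adapted  = re.compile(r"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])")

samples = ["'s", "'S", "'re", "'RE", "'Ve", "'ll", "'D", "'x", "abc"]
for s in samples:
    # both patterns must agree on every sample, matching case-insensitively
    assert bool(original.fullmatch(s)) == bool(adapted.fullmatch(s)), s
print("contraction alternations agree on all samples")
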
@@ -6226,6 +6226,11 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "falcon") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
+            } else if (
+                tokenizer_pre == "falcon3") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON3;
+                vocab.tokenizer_ignore_merges = true;
+                vocab.tokenizer_add_bos = true;
             } else if (
                 tokenizer_pre == "mpt") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;

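For reference, the new branch keys off the `tokenizer.ggml.pre` string written into the GGUF metadata by the conversion script and, for Falcon3, additionally enables `ignore_merges` and `add_bos`. A small illustrative Python mirror of that mapping; names and structure here are for exposition only, not llama.cpp API:

from dataclasses import dataclass

@dataclass
class PreTokenizerConfig:
    pre_type: str
    ignore_merges: bool = False
    add_bos: bool = False

# illustrative mirror of the C++ if/else chain above, keyed by `tokenizer.ggml.pre`
PRE_TOKENIZERS = {
    "falcon":  PreTokenizerConfig("LLAMA_VOCAB_PRE_TYPE_FALCON"),
    "falcon3": PreTokenizerConfig("LLAMA_VOCAB_PRE_TYPE_FALCON3", ignore_merges=True, add_bos=True),
    "mpt":     PreTokenizerConfig("LLAMA_VOCAB_PRE_TYPE_MPT"),
}

print(PRE_TOKENIZERS["falcon3"])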