convert : add falcon
ggml-ci
This commit is contained in:
parent
4e3e6d8ecc
commit
1c888eb4da
5 changed files with 29 additions and 12 deletions
|
@ -6,6 +6,7 @@
|
||||||
# python3 convert-hf-to-gguf-update.py <huggingface_token>
|
# python3 convert-hf-to-gguf-update.py <huggingface_token>
|
||||||
#
|
#
|
||||||
# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py
|
# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py
|
||||||
|
# - Update llama.cpp with the new pre-tokenizer if necessary
|
||||||
#
|
#
|
||||||
# TODO: generate tokenizer tests for llama.cpp
|
# TODO: generate tokenizer tests for llama.cpp
|
||||||
#
|
#
|
||||||
|
@ -33,13 +34,14 @@ else:
|
||||||
print("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
|
print("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
# TODO: add models here
|
# TODO: add models here, base models preferred
|
||||||
models = [
|
models = [
|
||||||
{ "name": "llama-v2", "tokenizer_type": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
|
{ "name": "llama-v2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
|
||||||
{ "name": "llama-v3", "tokenizer_type": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
|
{ "name": "llama-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
|
||||||
{ "name": "deepseek-llm", "tokenizer_type": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-chat", },
|
{ "name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
|
||||||
{ "name": "deepseek-coder", "tokenizer_type": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
|
{ "name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
|
||||||
{ "name": "bert-bge", "tokenizer_type": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
|
{ "name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
|
||||||
|
{ "name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
|
||||||
]
|
]
|
||||||
|
|
||||||
# make directory "models/tokenizers" if it doesn't exist
|
# make directory "models/tokenizers" if it doesn't exist
|
||||||
|
@ -59,7 +61,7 @@ def download_file_with_auth(url, token, save_path):
|
||||||
for model in models:
|
for model in models:
|
||||||
name = model["name"]
|
name = model["name"]
|
||||||
repo = model["repo"]
|
repo = model["repo"]
|
||||||
tokenizer_type = model["tokenizer_type"]
|
tokt = model["tokt"]
|
||||||
|
|
||||||
if not os.path.exists(f"models/tokenizers/{name}"):
|
if not os.path.exists(f"models/tokenizers/{name}"):
|
||||||
os.makedirs(f"models/tokenizers/{name}")
|
os.makedirs(f"models/tokenizers/{name}")
|
||||||
|
@ -73,7 +75,7 @@ for model in models:
|
||||||
save_path = f"models/tokenizers/{name}/tokenizer.json"
|
save_path = f"models/tokenizers/{name}/tokenizer.json"
|
||||||
download_file_with_auth(url, token, save_path)
|
download_file_with_auth(url, token, save_path)
|
||||||
|
|
||||||
if tokenizer_type == TOKENIZER_TYPE.SPM:
|
if tokt == TOKENIZER_TYPE.SPM:
|
||||||
url = f"{repo}/resolve/main/tokenizer.model"
|
url = f"{repo}/resolve/main/tokenizer.model"
|
||||||
save_path = f"models/tokenizers/{name}/tokenizer.model"
|
save_path = f"models/tokenizers/{name}/tokenizer.model"
|
||||||
download_file_with_auth(url, token, save_path)
|
download_file_with_auth(url, token, save_path)
|
||||||
|
@ -88,9 +90,9 @@ for model in models:
|
||||||
src_ifs = ""
|
src_ifs = ""
|
||||||
for model in models:
|
for model in models:
|
||||||
name = model["name"]
|
name = model["name"]
|
||||||
tokenizer_type = model["tokenizer_type"]
|
tokt = model["tokt"]
|
||||||
|
|
||||||
if tokenizer_type == TOKENIZER_TYPE.SPM:
|
if tokt == TOKENIZER_TYPE.SPM:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# create the tokenizer
|
# create the tokenizer
|
||||||
|
@ -101,7 +103,7 @@ for model in models:
|
||||||
chkhsh = sha256(str(chktok).encode()).hexdigest()
|
chkhsh = sha256(str(chktok).encode()).hexdigest()
|
||||||
|
|
||||||
print(f"model: {name}")
|
print(f"model: {name}")
|
||||||
print(f"tokenizer_type: {tokenizer_type}")
|
print(f"tokt: {tokt}")
|
||||||
print(f"repo: {model['repo']}")
|
print(f"repo: {model['repo']}")
|
||||||
print(f"chktok: {chktok}")
|
print(f"chktok: {chktok}")
|
||||||
print(f"chkhsh: {chkhsh}")
|
print(f"chkhsh: {chkhsh}")
|
||||||
|
|
|
@ -401,11 +401,14 @@ class Model(ABC):
|
||||||
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
|
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
|
||||||
res = "llama-v3"
|
res = "llama-v3"
|
||||||
if chkhsh == "58c3d0e812ae7fa6a20931006d2398274732c105a9a964c148c43cf898c5fb7a":
|
if chkhsh == "58c3d0e812ae7fa6a20931006d2398274732c105a9a964c148c43cf898c5fb7a":
|
||||||
# ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-chat
|
# ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
|
||||||
res = "deepseek-llm"
|
res = "deepseek-llm"
|
||||||
if chkhsh == "0438d2a948d7fb26c7a662705ac68374f3138ee29e44f133b1f059203500fb4d":
|
if chkhsh == "0438d2a948d7fb26c7a662705ac68374f3138ee29e44f133b1f059203500fb4d":
|
||||||
# ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
|
# ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
|
||||||
res = "deepseek-coder"
|
res = "deepseek-coder"
|
||||||
|
if chkhsh == "822bdd323c3ef8667a9526b16b5bfe97974059838d992a170f965063f99c9b9e":
|
||||||
|
# ref: https://huggingface.co/tiiuae/falcon-7b
|
||||||
|
res = "falcon"
|
||||||
if chkhsh == "406f3f61e1c67d7b0456c5df2fce5cbb30c77dd3671a436b07a6c510303f721e":
|
if chkhsh == "406f3f61e1c67d7b0456c5df2fce5cbb30c77dd3671a436b07a6c510303f721e":
|
||||||
# ref: https://huggingface.co/BAAI/bge-small-en-v1.5
|
# ref: https://huggingface.co/BAAI/bge-small-en-v1.5
|
||||||
res = "bert-bge"
|
res = "bert-bge"
|
||||||
|
|
11
llama.cpp
11
llama.cpp
|
@ -4348,6 +4348,9 @@ static void llm_load_vocab(
|
||||||
} else if (
|
} else if (
|
||||||
tokenizer_pre == "deepseek-coder") {
|
tokenizer_pre == "deepseek-coder") {
|
||||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
|
||||||
|
} else if (
|
||||||
|
tokenizer_pre == "falcon") {
|
||||||
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
|
||||||
} else {
|
} else {
|
||||||
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||||
}
|
}
|
||||||
|
@ -12112,6 +12115,14 @@ struct llm_tokenizer_bpe {
|
||||||
"\\p{N}+",
|
"\\p{N}+",
|
||||||
});
|
});
|
||||||
break;
|
break;
|
||||||
|
case LLAMA_VOCAB_PRE_TYPE_FALCON:
|
||||||
|
word_collection = unicode_regex_split(text, {
|
||||||
|
"[\\p{P}\\$\\+<=>\\^~\\|]+",
|
||||||
|
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||||
|
"\\p{N}+",
|
||||||
|
"[0-9][0-9][0-9]",
|
||||||
|
});
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
// default regex for BPE tokenization pre-processing
|
// default regex for BPE tokenization pre-processing
|
||||||
word_collection = unicode_regex_split(text, {
|
word_collection = unicode_regex_split(text, {
|
||||||
|
|
1
llama.h
1
llama.h
|
@ -75,6 +75,7 @@ extern "C" {
|
||||||
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
|
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
|
||||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
|
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
|
||||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
|
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
|
||||||
|
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
|
||||||
};
|
};
|
||||||
|
|
||||||
// note: these values should be synchronized with ggml_rope
|
// note: these values should be synchronized with ggml_rope
|
||||||
|
|
Binary file not shown.
Loading…
Add table
Add a link
Reference in a new issue