diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index c1d53aaf0..deca36bf9 100644
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -6,6 +6,7 @@
 #   python3 convert-hf-to-gguf-update.py <huggingface_token>
 #
 # - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py
+# - Update llama.cpp with the new pre-tokenizer if necessary
 #
 # TODO: generate tokenizer tests for llama.cpp
 #
@@ -33,13 +34,14 @@ else:
     print("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
     sys.exit(1)
 
-# TODO: add models here
+# TODO: add models here, base models preferred
 models = [
-    { "name": "llama-v2",       "tokenizer_type": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
-    { "name": "llama-v3",       "tokenizer_type": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
-    { "name": "deepseek-llm",   "tokenizer_type": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-chat", },
-    { "name": "deepseek-coder", "tokenizer_type": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
-    { "name": "bert-bge",       "tokenizer_type": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
+    { "name": "llama-v2",       "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
+    { "name": "llama-v3",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
+    { "name": "deepseek-llm",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
+    { "name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
+    { "name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
+    { "name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
 ]
 
 # make directory "models/tokenizers" if it doesn't exist
@@ -59,7 +61,7 @@ def download_file_with_auth(url, token, save_path):
 for model in models:
     name = model["name"]
     repo = model["repo"]
-    tokenizer_type = model["tokenizer_type"]
+    tokt = model["tokt"]
 
     if not os.path.exists(f"models/tokenizers/{name}"):
         os.makedirs(f"models/tokenizers/{name}")
@@ -73,7 +75,7 @@ for model in models:
     save_path = f"models/tokenizers/{name}/tokenizer.json"
     download_file_with_auth(url, token, save_path)
 
-    if tokenizer_type == TOKENIZER_TYPE.SPM:
+    if tokt == TOKENIZER_TYPE.SPM:
         url = f"{repo}/resolve/main/tokenizer.model"
         save_path = f"models/tokenizers/{name}/tokenizer.model"
         download_file_with_auth(url, token, save_path)
@@ -88,9 +90,9 @@ for model in models:
 src_ifs = ""
 for model in models:
     name = model["name"]
-    tokenizer_type = model["tokenizer_type"]
+    tokt = model["tokt"]
 
-    if tokenizer_type == TOKENIZER_TYPE.SPM:
+    if tokt == TOKENIZER_TYPE.SPM:
         continue
 
     # create the tokenizer
@@ -101,7 +103,7 @@ for model in models:
     chkhsh = sha256(str(chktok).encode()).hexdigest()
 
     print(f"model: {name}")
-    print(f"tokenizer_type: {tokenizer_type}")
+    print(f"tokt: {tokt}")
     print(f"repo: {model['repo']}")
     print(f"chktok: {chktok}")
     print(f"chkhsh: {chkhsh}")
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 9b2f68cfd..56475a49a 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -401,11 +401,14 @@ class Model(ABC):
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
             res = "llama-v3"
         if chkhsh == "58c3d0e812ae7fa6a20931006d2398274732c105a9a964c148c43cf898c5fb7a":
-            # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-chat
+            # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
             res = "deepseek-llm"
         if chkhsh == "0438d2a948d7fb26c7a662705ac68374f3138ee29e44f133b1f059203500fb4d":
             # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
             res = "deepseek-coder"
+        if chkhsh == "822bdd323c3ef8667a9526b16b5bfe97974059838d992a170f965063f99c9b9e":
+            # ref: https://huggingface.co/tiiuae/falcon-7b
+            res = "falcon"
         if chkhsh == "406f3f61e1c67d7b0456c5df2fce5cbb30c77dd3671a436b07a6c510303f721e":
             # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
             res = "bert-bge"
diff --git a/llama.cpp b/llama.cpp
index eaf9a8da0..98316161c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4348,6 +4348,9 @@ static void llm_load_vocab(
             } else if (
                     tokenizer_pre == "deepseek-coder") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
+            } else if (
+                    tokenizer_pre == "falcon") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -12112,6 +12115,14 @@ struct llm_tokenizer_bpe {
                         "\\p{N}+",
                     });
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_FALCON:
+                word_collection = unicode_regex_split(text, {
+                        "[\\p{P}\\$\\+<=>\\^~\\|]+",
+                        "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                        "\\p{N}+",
+                        "[0-9][0-9][0-9]",
+                    });
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 word_collection = unicode_regex_split(text, {
diff --git a/llama.h b/llama.h
index 083ce22f1..3beb80e0c 100644
--- a/llama.h
+++ b/llama.h
@@ -75,6 +75,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
         LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
         LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
+        LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
     };
 
     // note: these values should be synchronized with ggml_rope
diff --git a/models/ggml-vocab-falcon.gguf b/models/ggml-vocab-falcon.gguf
index d4ea2e822..334d50da5 100644
Binary files a/models/ggml-vocab-falcon.gguf and b/models/ggml-vocab-falcon.gguf differ