llama : fix pre-tokenizers
This commit is contained in:
parent
e59b54657a
commit
b7ede48294
3 changed files with 56 additions and 42 deletions
|
@ -55,8 +55,8 @@ else:
|
|||
|
||||
# TODO: add models here, base models preferred
|
||||
models = [
|
||||
#{"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
|
||||
#{"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
|
||||
{"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
|
||||
{"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
|
||||
{"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
|
||||
{"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
|
||||
{"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
|
||||
|
@ -70,8 +70,9 @@ models = [
|
|||
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
|
||||
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
|
||||
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
|
||||
{"name": "jina-embeddings-v2-base-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
||||
{"name": "jina-embeddings-v2-base-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
||||
{"name": "jina-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
|
||||
{"name": "jina-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
||||
{"name": "jina-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
||||
]
|
||||
|
||||
# make directory "models/tokenizers" if it doesn't exist
|
||||
|
|
|
@ -322,12 +322,17 @@ class Model(ABC):
|
|||
# ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
|
||||
res = "olmo"
|
||||
if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
|
||||
# ref: https://huggingface.co/databricks/dbrx-instruct
|
||||
# ref: https://huggingface.co/databricks/dbrx-base
|
||||
res = "dbrx"
|
||||
if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
|
||||
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
|
||||
res = "jina-en"
|
||||
if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
|
||||
res = "default"
|
||||
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
|
||||
res = "jina-es"
|
||||
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
|
||||
res = "default"
|
||||
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
|
||||
res = "jina-de"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
|
|
10
llama.cpp
10
llama.cpp
|
@ -3800,6 +3800,12 @@ static void llm_load_hparams(
|
|||
|
||||
// get hparams kv
|
||||
ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
|
||||
|
||||
// everything past this point is not vocab-related
|
||||
if (hparams.vocab_only) {
|
||||
return;
|
||||
}
|
||||
|
||||
ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
|
||||
ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
|
||||
ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
|
||||
|
@ -4417,7 +4423,9 @@ static void llm_load_vocab(
|
|||
tokenizer_pre == "starcoder") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
|
||||
} else if (
|
||||
tokenizer_pre == "gpt-2") {
|
||||
tokenizer_pre == "gpt-2" ||
|
||||
tokenizer_pre == "jina-es" ||
|
||||
tokenizer_pre == "jina-de") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
|
||||
} else if (
|
||||
tokenizer_pre == "refact") {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue