feat: add pre tokenization
This commit is contained in:
parent
f6365b82cd
commit
14cd69a87d
4 changed files with 32 additions and 8 deletions
|
@ -56,6 +56,9 @@ models = [
|
||||||
{ "name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
|
{ "name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
|
||||||
{ "name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
|
{ "name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
|
||||||
{ "name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
|
{ "name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
|
||||||
|
{ "name": "jina-embeddings-v2-base-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
||||||
|
{ "name": "jina-embeddings-v2-base-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
||||||
|
{ "name": "jina-embeddings-v2-base-zh", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-zh", },
|
||||||
]
|
]
|
||||||
|
|
||||||
# make directory "models/tokenizers" if it doesn't exist
|
# make directory "models/tokenizers" if it doesn't exist
|
||||||
|
|
|
@ -307,6 +307,15 @@ class Model(ABC):
|
||||||
if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
|
if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
|
||||||
# ref: https://huggingface.co/openai-community/gpt2
|
# ref: https://huggingface.co/openai-community/gpt2
|
||||||
res = "gpt-2"
|
res = "gpt-2"
|
||||||
|
if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
|
||||||
|
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
|
||||||
|
res = "jina-embeddings-v2-base-es"
|
||||||
|
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
|
||||||
|
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
|
||||||
|
res = "jina-embeddings-v2-base-de"
|
||||||
|
if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
|
||||||
|
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
|
||||||
|
res = "jina-embeddings-v2-base-zh"
|
||||||
|
|
||||||
if res is None:
|
if res is None:
|
||||||
print("\n")
|
print("\n")
|
||||||
|
|
|
@ -4417,6 +4417,15 @@ static void llm_load_vocab(
|
||||||
} else if (
|
} else if (
|
||||||
tokenizer_pre == "gpt-2") {
|
tokenizer_pre == "gpt-2") {
|
||||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
|
||||||
|
} else if (
|
||||||
|
tokenizer_pre == "jina-embeddings-v2-base-es") {
|
||||||
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JINA_EMBEDDINGS_V2_ES;
|
||||||
|
} else if (
|
||||||
|
tokenizer_pre == "jina-embeddings-v2-base-de") {
|
||||||
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JINA_EMBEDDINGS_V2_DE;
|
||||||
|
} else if (
|
||||||
|
tokenizer_pre == "jina-embeddings-v2-base-zh") {
|
||||||
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JINA_EMBEDDINGS_V2_ZH;
|
||||||
} else {
|
} else {
|
||||||
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||||
}
|
}
|
||||||
|
|
19
llama.h
19
llama.h
|
@ -71,14 +71,17 @@ extern "C" {
|
||||||
|
|
||||||
// pre-tokenization types
|
// pre-tokenization types
|
||||||
enum llama_vocab_pre_type {
|
enum llama_vocab_pre_type {
|
||||||
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
|
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
|
||||||
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
|
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
|
||||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
|
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
|
||||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
|
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
|
||||||
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
|
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
|
||||||
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
|
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
|
||||||
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
|
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
|
||||||
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
|
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
|
||||||
|
LLAMA_VOCAB_PRE_TYPE_JINA_EMBEDDINGS_V2_ES = 8,
|
||||||
|
LLAMA_VOCAB_PRE_TYPE_JINA_EMBEDDINGS_V2_DE = 9,
|
||||||
|
LLAMA_VOCAB_PRE_TYPE_JINA_EMBEDDINGS_V2_ZH = 10,
|
||||||
};
|
};
|
||||||
|
|
||||||
// note: these values should be synchronized with ggml_rope
|
// note: these values should be synchronized with ggml_rope
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue