feat: add pre tokenization

This commit is contained in:
Joan Martinez 2024-05-02 11:59:03 +02:00
parent f6365b82cd
commit 14cd69a87d
4 changed files with 32 additions and 8 deletions

View file

@ -56,6 +56,9 @@ models = [
{ "name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
{ "name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
{ "name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
{ "name": "jina-embeddings-v2-base-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{ "name": "jina-embeddings-v2-base-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{ "name": "jina-embeddings-v2-base-zh", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-zh", },
]
# make directory "models/tokenizers" if it doesn't exist

View file

@ -307,6 +307,15 @@ class Model(ABC):
if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
# ref: https://huggingface.co/openai-community/gpt2
res = "gpt-2"
if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
res = "jina-embeddings-v2-base-es"
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
res = "jina-embeddings-v2-base-de"
if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
res = "jina-embeddings-v2-base-zh"
if res is None:
print("\n")

View file

@ -4417,6 +4417,15 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "gpt-2") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
} else if (
tokenizer_pre == "jina-embeddings-v2-base-es") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JINA_EMBEDDINGS_V2_ES;
} else if (
tokenizer_pre == "jina-embeddings-v2-base-de") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JINA_EMBEDDINGS_V2_DE;
} else if (
tokenizer_pre == "jina-embeddings-v2-base-zh") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JINA_EMBEDDINGS_V2_ZH;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}

19
llama.h
View file

@ -71,14 +71,17 @@ extern "C" {
// pre-tokenization types
enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
LLAMA_VOCAB_PRE_TYPE_JINA_EMBEDDINGS_V2_ES = 8,
LLAMA_VOCAB_PRE_TYPE_JINA_EMBEDDINGS_V2_DE = 9,
LLAMA_VOCAB_PRE_TYPE_JINA_EMBEDDINGS_V2_ZH = 10,
};
// note: these values should be synchronized with ggml_rope