diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index b019c1e3d..d2c6db16b 100644
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -56,6 +56,9 @@ models = [
     { "name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
     { "name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
     { "name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
+    { "name": "jina-embeddings-v2-base-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
+    { "name": "jina-embeddings-v2-base-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
+    { "name": "jina-embeddings-v2-base-zh", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-zh", },
 ]
 
 # make directory "models/tokenizers" if it doesn't exist
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 0f4085072..67839f341 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -307,6 +307,15 @@ class Model(ABC):
         if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
             # ref: https://huggingface.co/openai-community/gpt2
             res = "gpt-2"
+        if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
+            res = "jina-embeddings-v2-base-es"
+        if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
+            res = "jina-embeddings-v2-base-de"
+        if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
+            res = "jina-embeddings-v2-base-zh"
 
         if res is None:
             print("\n")
diff --git a/llama.cpp b/llama.cpp
index e76f58820..23013ddca 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4417,6 +4417,15 @@ static void llm_load_vocab(
             } else if (
                     tokenizer_pre == "gpt-2") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
+            } else if (
+                    tokenizer_pre == "jina-embeddings-v2-base-es") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JINA_EMBEDDINGS_V2_ES;
+            } else if (
+                    tokenizer_pre == "jina-embeddings-v2-base-de") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JINA_EMBEDDINGS_V2_DE;
+            } else if (
+                    tokenizer_pre == "jina-embeddings-v2-base-zh") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JINA_EMBEDDINGS_V2_ZH;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
diff --git a/llama.h b/llama.h
index 059d78f11..d99663155 100644
--- a/llama.h
+++ b/llama.h
@@ -71,14 +71,17 @@ extern "C" {
 
     // pre-tokenization types
    enum llama_vocab_pre_type {
-        LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
-        LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
-        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
-        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
-        LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
-        LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
-        LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
-        LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
+        LLAMA_VOCAB_PRE_TYPE_DEFAULT                = 0,
+        LLAMA_VOCAB_PRE_TYPE_LLAMA3                 = 1,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM           = 2,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER         = 3,
+        LLAMA_VOCAB_PRE_TYPE_FALCON                 = 4,
+        LLAMA_VOCAB_PRE_TYPE_MPT                    = 5,
+        LLAMA_VOCAB_PRE_TYPE_STARCODER              = 6,
+        LLAMA_VOCAB_PRE_TYPE_GPT2                   = 7,
+        LLAMA_VOCAB_PRE_TYPE_JINA_EMBEDDINGS_V2_ES = 8,
+        LLAMA_VOCAB_PRE_TYPE_JINA_EMBEDDINGS_V2_DE = 9,
+        LLAMA_VOCAB_PRE_TYPE_JINA_EMBEDDINGS_V2_ZH = 10,
     };
 
     // note: these values should be synchronized with ggml_rope
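For reviewers unfamiliar with the update flow: the chkhsh constants added to convert-hf-to-gguf.py are generated by convert-hf-to-gguf-update.py, which runs each Hugging Face tokenizer over a fixed probe string and hashes the resulting token IDs, so tokenizers that pre-tokenize identically collapse onto the same hash. Below is a minimal sketch of that derivation, assuming the transformers AutoTokenizer API; the probe text here is a placeholder, while the real probe string lives in the update script.

    from hashlib import sha256
    from transformers import AutoTokenizer

    def compute_chkhsh(repo: str, chktxt: str) -> str:
        # Encode the fixed probe text and hash the token ID sequence; the
        # digest fingerprints pre-tokenizer behavior, not model weights.
        tokenizer = AutoTokenizer.from_pretrained(repo)
        chktok = tokenizer.encode(chktxt)
        return sha256(str(chktok).encode()).hexdigest()

    # Hypothetical usage; the real probe string in the update script
    # exercises whitespace runs, emoji, digits, and mixed-script text.
    print(compute_chkhsh("jinaai/jina-embeddings-v2-base-es", "placeholder probe"))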