feat: add pre tokenization

2024-05-02 11:59:03 +02:00 · 2024-05-02 11:59:03 +02:00 · 14cd69a87d
commit 14cd69a87d
parent f6365b82cd
4 changed files with 32 additions and 8 deletions
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -56,6 +56,9 @@ models = [
        { "name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
        { "name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
        { "name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
+        { "name": "jina-embeddings-v2-base-es",  "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
+        { "name": "jina-embeddings-v2-base-de",  "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
+        { "name": "jina-embeddings-v2-base-zh",  "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-zh", },
        ]

 # make directory "models/tokenizers" if it doesn't exist
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -307,6 +307,15 @@ class Model(ABC):
        if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
            # ref: https://huggingface.co/openai-community/gpt2
            res = "gpt-2"
+        if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
+            res = "jina-embeddings-v2-base-es"
+        if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
+            res = "jina-embeddings-v2-base-de"
+        if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
+            res = "jina-embeddings-v2-base-zh"

        if res is None:
            print("\n")
--- a/llama.cpp
+++ b/llama.cpp
@@ -4417,6 +4417,15 @@ static void llm_load_vocab(
            } else if (
                    tokenizer_pre == "gpt-2") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
+            } else if (
+                    tokenizer_pre == "jina-embeddings-v2-base-es") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JINA_EMBEDDINGS_V2_ES;
+            } else if (
+                    tokenizer_pre == "jina-embeddings-v2-base-de") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JINA_EMBEDDINGS_V2_DE;
+            } else if (
+                    tokenizer_pre == "jina-embeddings-v2-base-zh") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JINA_EMBEDDINGS_V2_ZH;
            } else {
                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
            }
--- a/llama.h
+++ b/llama.h
@@ -71,14 +71,17 @@ extern "C" {

    // pre-tokenization types
    enum llama_vocab_pre_type {
-        LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
-        LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
-        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
-        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
-        LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
-        LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
-        LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
-        LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
+        LLAMA_VOCAB_PRE_TYPE_DEFAULT               = 0,
+        LLAMA_VOCAB_PRE_TYPE_LLAMA3                = 1,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM          = 2,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER        = 3,
+        LLAMA_VOCAB_PRE_TYPE_FALCON                = 4,
+        LLAMA_VOCAB_PRE_TYPE_MPT                   = 5,
+        LLAMA_VOCAB_PRE_TYPE_STARCODER             = 6,
+        LLAMA_VOCAB_PRE_TYPE_GPT2                  = 7,
+        LLAMA_VOCAB_PRE_TYPE_JINA_EMBEDDINGS_V2_ES = 8,
+        LLAMA_VOCAB_PRE_TYPE_JINA_EMBEDDINGS_V2_DE = 9,
+        LLAMA_VOCAB_PRE_TYPE_JINA_EMBEDDINGS_V2_ZH = 10,
    };

    // note: these values should be synchronized with ggml_rope