feat: small changes to allow jina embeddings ZH model
commit 603f18bc46
parent c3f4b1f2d2
1 changed file with 13 additions and 5 deletions
llama.cpp | 18

@@ -4303,8 +4303,7 @@ static void llm_load_vocab(
     for (uint32_t i = 0; i < n_vocab; i++) {
         std::string word = gguf_get_arr_str(ctx, token_idx, i);
-        GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
+        //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0); // check removed: some vocabs (e.g. jinaai-embeddings-v2-base-zh) mistakenly contain the NULL token (not ideal if it happens more than once)
         vocab.token_to_id[word] = i;

         auto & token_data = vocab.id_to_token[i];
@@ -4325,9 +4324,18 @@ static void llm_load_vocab(
     } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
         vocab.linefeed_id = vocab.special_pad_id;
     } else {
-        const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
-        GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
-        vocab.linefeed_id = ids[0];
+        try {
+            const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
+            if (ids.empty()) {
+                LLAMA_LOG_WARN("%s: %s vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, llama_model_vocab_type_name(vocab.type), "\xC4\x8A");
+                vocab.linefeed_id = -1;
+            } else {
+                vocab.linefeed_id = ids[0];
+            }
+        } catch (const std::exception & e) {
+            LLAMA_LOG_WARN("%s: %s vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, llama_model_vocab_type_name(vocab.type), e.what());
+            vocab.linefeed_id = vocab.special_pad_id;
+        }
     }

     // special tokens
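Note on the "\xC4\x8A" lookup above: GPT-2-style byte-level BPE vocabularies do not store a literal "\n" token; the non-printable newline byte 0x0A is remapped to the printable code point U+010A ("Ċ"), whose UTF-8 encoding is the two bytes 0xC4 0x8A, so tokenizing that string is how the linefeed token id is recovered. The standalone sketch below is not part of the commit and uses no llama.cpp API; it only decodes those two bytes to confirm they correspond to U+010A.

// Standalone sketch: decode the UTF-8 bytes 0xC4 0x8A and check that they
// yield code point U+010A, the GPT-2 byte-level BPE stand-in for '\n'
// (that mapping shifts non-printable bytes such as 0x0A into the U+0100 range).
#include <cstdint>
#include <cstdio>
#include <string>

// Decode a single two-byte UTF-8 sequence (110xxxxx 10xxxxxx) into a code point.
static uint32_t decode_utf8_2byte(const std::string & s) {
    const uint8_t b0 = static_cast<uint8_t>(s[0]);
    const uint8_t b1 = static_cast<uint8_t>(s[1]);
    return ((b0 & 0x1Fu) << 6) | (b1 & 0x3Fu);
}

int main() {
    const std::string newline_repr = "\xC4\x8A";           // same literal as in the diff
    const uint32_t cpt = decode_utf8_2byte(newline_repr);  // expected: 0x010A
    std::printf("code point: U+%04X\n", static_cast<unsigned>(cpt));
    return 0;
}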