feat: small changes to allow jina embeddings ZH model

Joan Martinez 2024-04-29 12:23:20 +02:00
parent c3f4b1f2d2
commit 603f18bc46


@@ -4303,8 +4303,7 @@ static void llm_load_vocab(
     for (uint32_t i = 0; i < n_vocab; i++) {
         std::string word = gguf_get_arr_str(ctx, token_idx, i);
-        GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
+        //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0); Remove check, some vocabs contain by mistake the NULL in vocab, (not ideal if it happens more than once) (jinaai-embeddings-v2-base-zh)
         vocab.token_to_id[word] = i;
         auto & token_data = vocab.id_to_token[i];
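
Why the check was removed: it fails as soon as a vocab entry decodes to zero Unicode code points, which is what an accidental empty ("NULL") entry in jinaai-embeddings-v2-base-zh produces. Below is a minimal standalone sketch of that failure mode; unicode_cpts_from_utf8_sketch and the toy vocab are illustrative assumptions, not llama.cpp code.

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// crude stand-in for unicode_cpts_from_utf8: one entry per UTF-8 lead byte;
// the exact code point values do not matter for the size() > 0 check
static std::vector<uint32_t> unicode_cpts_from_utf8_sketch(const std::string & s) {
    std::vector<uint32_t> cpts;
    for (size_t i = 0; i < s.size();) {
        const unsigned char c = (unsigned char) s[i];
        const size_t len = (c < 0x80) ? 1 : (c < 0xE0) ? 2 : (c < 0xF0) ? 3 : 4;
        cpts.push_back(c);
        i += len;
    }
    return cpts;
}

int main() {
    // toy vocab with a mistaken empty ("NULL") entry, as described in the commit comment
    const std::vector<std::string> vocab = { "hello", "", "world" };
    for (size_t i = 0; i < vocab.size(); i++) {
        if (unicode_cpts_from_utf8_sketch(vocab[i]).empty()) {
            // the old GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0) would abort here;
            // after this commit the entry is tolerated instead
            std::printf("token %zu decodes to zero code points\n", i);
        }
    }
    return 0;
}
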
@@ -4325,9 +4324,18 @@ static void llm_load_vocab(
     } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
         vocab.linefeed_id = vocab.special_pad_id;
     } else {
-        const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
-        GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
-        vocab.linefeed_id = ids[0];
+        try {
+            const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
+            if (ids.empty()) {
+                LLAMA_LOG_WARN("%s: %s vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, llama_model_vocab_type_name(vocab.type), "\xC4\x8A");
+                vocab.linefeed_id = -1;
+            } else {
+                vocab.linefeed_id = ids[0];
+            }
+        } catch (const std::exception & e) {
+            LLAMA_LOG_WARN("%s: %s vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, llama_model_vocab_type_name(vocab.type), e.what());
+            vocab.linefeed_id = vocab.special_pad_id;
+        }
     }

     // special tokens
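
The second hunk replaces a hard assert with a warn-and-fall-back path when the byte-level BPE newline token "\xC4\x8A" (U+010A) cannot be tokenized. A minimal standalone sketch of the same pattern, using a stub tokenizer (tokenize_stub, pad_id and the toy vocab are assumptions for illustration, not llama_tokenize_internal's real signature):

#include <cstdio>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>

// stub tokenizer: looks the text up as a single token and throws when it is absent,
// standing in for the real tokenizer only for the purpose of this sketch
static std::vector<int> tokenize_stub(const std::map<std::string, int> & vocab, const std::string & text) {
    const auto it = vocab.find(text);
    if (it == vocab.end()) {
        throw std::runtime_error("token not found: " + text);
    }
    return { it->second };
}

int main() {
    const int pad_id = 0; // stand-in for vocab.special_pad_id
    // toy vocab that lacks the byte-level BPE newline token "\xC4\x8A" (U+010A)
    const std::map<std::string, int> vocab = { { "hello", 1 }, { "world", 2 } };

    int linefeed_id = -1;
    try {
        const std::vector<int> ids = tokenize_stub(vocab, "\xC4\x8A");
        linefeed_id = ids.empty() ? -1 : ids[0];
    } catch (const std::exception & e) {
        std::printf("warning: newline token not found (%s), falling back to pad id\n", e.what());
        linefeed_id = pad_id;
    }
    std::printf("linefeed_id = %d\n", linefeed_id);
    return 0;
}

With the stub vocab missing the newline token, the lookup throws and the sketch falls back to the pad id, mirroring the catch branch in the hunk above.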