feat: small changes to allow jina embeddings ZH model
commit 603f18bc46
parent c3f4b1f2d2
1 changed file with 13 additions and 5 deletions
llama.cpp | 18

@@ -4303,8 +4303,7 @@ static void llm_load_vocab(
     for (uint32_t i = 0; i < n_vocab; i++) {
         std::string word = gguf_get_arr_str(ctx, token_idx, i);
-        GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
+        //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0); // check removed: some vocabs (e.g. jinaai-embeddings-v2-base-zh) mistakenly contain the NULL token (not ideal if it happens more than once)
         vocab.token_to_id[word] = i;

         auto & token_data = vocab.id_to_token[i];
@@ -4325,9 +4324,18 @@ static void llm_load_vocab(
     } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
         vocab.linefeed_id = vocab.special_pad_id;
     } else {
-        const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
-        GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
-        vocab.linefeed_id = ids[0];
+        try {
+            const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
+            if (ids.empty()) {
+                LLAMA_LOG_WARN("%s: %s vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, llama_model_vocab_type_name(vocab.type), "\xC4\x8A");
+                vocab.linefeed_id = -1;
+            } else {
+                vocab.linefeed_id = ids[0];
+            }
+        } catch (const std::exception & e) {
+            LLAMA_LOG_WARN("%s: %s vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, llama_model_vocab_type_name(vocab.type), e.what());
+            vocab.linefeed_id = vocab.special_pad_id;
+        }
     }

     // special tokens
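Note on the "\xC4\x8A" lookup above: GPT-2-style byte-level BPE vocabularies do not store a literal "\n" token; the non-printable newline byte 0x0A is remapped to the printable code point U+010A ("Ċ"), whose UTF-8 encoding is the two bytes 0xC4 0x8A, so tokenizing that string is how the linefeed token id is recovered. The standalone sketch below is not part of the commit and uses no llama.cpp API; it only decodes those two bytes to confirm they correspond to U+010A.

// Standalone sketch: decode the UTF-8 bytes 0xC4 0x8A and check that they
// yield code point U+010A, the GPT-2 byte-level BPE stand-in for '\n'
// (that mapping shifts non-printable bytes such as 0x0A into the U+0100 range).
#include <cstdint>
#include <cstdio>
#include <string>

// Decode a single two-byte UTF-8 sequence (110xxxxx 10xxxxxx) into a code point.
static uint32_t decode_utf8_2byte(const std::string & s) {
    const uint8_t b0 = static_cast<uint8_t>(s[0]);
    const uint8_t b1 = static_cast<uint8_t>(s[1]);
    return ((b0 & 0x1Fu) << 6) | (b1 & 0x3Fu);
}

int main() {
    const std::string newline_repr = "\xC4\x8A";           // same literal as in the diff
    const uint32_t cpt = decode_utf8_2byte(newline_repr);  // expected: 0x010A
    std::printf("code point: U+%04X\n", static_cast<unsigned>(cpt));
    return 0;
}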