diff --git a/llama.cpp b/llama.cpp
index 330df9de5..caff4c767 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4303,8 +4303,7 @@ static void llm_load_vocab(
         for (uint32_t i = 0; i < n_vocab; i++) {
             std::string word = gguf_get_arr_str(ctx, token_idx, i);
-            GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
-
+            //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0); // check removed: some vocabs (e.g. jinaai-embeddings-v2-base-zh) mistakenly contain NUL in the vocab (not ideal if it happens more than once)
             vocab.token_to_id[word] = i;

             auto & token_data = vocab.id_to_token[i];
@@ -4325,9 +4324,18 @@ static void llm_load_vocab(
         } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
             vocab.linefeed_id = vocab.special_pad_id;
         } else {
-            const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
-            GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
-            vocab.linefeed_id = ids[0];
+            try {
+                const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
+                if (ids.empty()) {
+                    LLAMA_LOG_WARN("%s: %s vocabulary, but newline token not found: %s! Using special_pad_id instead.\n", __func__, llama_model_vocab_type_name(vocab.type), "\xC4\x8A");
+                    vocab.linefeed_id = vocab.special_pad_id;
+                } else {
+                    vocab.linefeed_id = ids[0];
+                }
+            } catch (const std::exception & e) {
+                LLAMA_LOG_WARN("%s: %s vocabulary, but newline token not found: %s! Using special_pad_id instead.\n", __func__, llama_model_vocab_type_name(vocab.type), e.what());
+                vocab.linefeed_id = vocab.special_pad_id;
+            }
         }

         // special tokens
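
For context on the first hunk: `gguf_get_arr_str` returns a C string, so a vocab entry that begins with a NUL byte arrives as an empty `std::string`, which decodes to zero code points and trips the old assert. A minimal standalone sketch of that failure mode (not llama.cpp code; `raw` is a made-up stand-in for the bad vocab entry):

```cpp
// Sketch: why a NUL byte in the vocab made GGML_ASSERT fire.
#include <cassert>
#include <string>

int main() {
    const char * raw = "\0";   // hypothetical vocab entry: a lone NUL byte
    std::string word = raw;    // the C-string constructor stops at the NUL
    assert(word.empty());      // the token is now empty ...
    // ... so unicode_cpts_from_utf8(word).size() would be 0 here,
    // exactly the case the removed GGML_ASSERT(... > 0) rejected.
    return 0;
}
```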
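For context on the second hunk, the probe string `"\xC4\x8A"` is the UTF-8 encoding of U+010A ('Ċ'), which is how GPT-2-style byte-level BPE spells the newline byte: non-printable bytes are shifted up by 0x100 into visible code points. A minimal sketch verifying that encoding (standalone, not llama.cpp's own unicode helpers):

```cpp
// Sketch: '\n' (0x0A) remapped by byte-level BPE becomes U+010A,
// whose UTF-8 encoding is the two bytes C4 8A used in the probe string.
#include <cstdio>
#include <string>

// Encode a code point below U+0800 as UTF-8 (two bytes suffice here).
static std::string cpt_to_utf8(unsigned cpt) {
    std::string out;
    if (cpt < 0x80) {
        out += (char)cpt;
    } else {
        out += (char)(0xC0 | (cpt >> 6));
        out += (char)(0x80 | (cpt & 0x3F));
    }
    return out;
}

int main() {
    const unsigned newline_cpt = 0x0A + 0x100;  // byte-level BPE remap of '\n'
    const std::string utf8 = cpt_to_utf8(newline_cpt);
    printf("U+%04X -> %02X %02X\n", newline_cpt,
           (unsigned char)utf8[0], (unsigned char)utf8[1]);  // U+010A -> C4 8A
    return 0;
}
```

If tokenizing that probe yields nothing (or throws), the patch now falls back to `special_pad_id` with a warning instead of aborting the model load.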