From c863752ca7b9a8282ee695a19edc659a445c670b Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Wed, 12 Jun 2024 23:00:04 +0200 Subject: [PATCH] Generalize 'jina-v2' per token attributes --- llama.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index 215f514f7..b46ebdd5f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2187,7 +2187,6 @@ struct llama_vocab { bool tokenizer_add_bos = false; bool tokenizer_add_eos = false; bool tokenizer_ignore_merges = false; - bool tokenizer_mask_lstrip = false; int find_bpe_rank(const std::string & token_left, const std::string & token_right) const { GGML_ASSERT(token_left.find(' ') == std::string::npos); @@ -4688,7 +4687,6 @@ static void llm_load_vocab( tokenizer_pre == "jina-v2-de" || tokenizer_pre == "jina-v2-code") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2; - vocab.tokenizer_mask_lstrip = tokenizer_pre.find("jina-v2") < std::string::npos; } else if ( tokenizer_pre == "refact") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT; @@ -4929,7 +4927,7 @@ static void llm_load_vocab( ); // set attributes by model/tokenizer name - if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) { + if (_contains_any(tokenizer_pre, {"jina-v2-"})) { _set_token_attr("", LLAMA_TOKEN_ATTR_LSTRIP, true); } else if (_contains_any(model_name, {"phi-3", "phi3"})) { for (auto id : vocab.cache_special_tokens) {