From 01c9229186f6210186bcb44af4b5ed587e00895f Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Sat, 1 Jun 2024 21:22:57 +0200
Subject: [PATCH] Refactor + add 'jina-v2' for testing 'lstrip'

---
 llama.cpp                      | 77 ++++++++++++++++++----------
 tests/test-tokenizer-random.py |  2 +
 2 files changed, 44 insertions(+), 35 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 69f648a50..c282bceb7 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4872,9 +4872,29 @@ static void llm_load_vocab(
     //NOTE: Each model customizes per token attributes.
     //NOTE: Per token attributes are missing from the GGUF file.
     //TODO: Merge llama_token_type and llama_token_attrib.
+    //TODO: Extract attribs from GGUF file.
     {
+        auto _contains_any = [] (const std::string & str, const std::vector<std::string> & substrs) -> bool {
+            for (auto substr : substrs) {
+                if (str.find(substr) < std::string::npos) {
+                    return true;
+                }
+            }
+            return false;
+        };
+
+        auto _set_tokenid_attrib = [&] (const llama_vocab::id id, llama_token_attrib attrib, bool value) {
+            uint32_t attribs = vocab.id_to_token.at(id).attribs;
+            attribs = value ? (attribs | attrib) : (attribs & ~attrib);
+            vocab.id_to_token[id].attribs = (llama_token_attrib) attribs;
+        };
+
+        auto _set_token_attrib = [&] (const std::string & token, llama_token_attrib attrib, bool value) {
+            _set_tokenid_attrib(vocab.token_to_id.at(token), attrib, value);
+        };
+
         // convert token type as an attribute
-        for (auto data : vocab.id_to_token) {
+        for (auto & data : vocab.id_to_token) {
             uint32_t attrib = LLAMA_TOKEN_ATTRIB_UNDEFINED;
             attrib |= LLAMA_TOKEN_ATTRIB_UNKNOWN * (data.type == LLAMA_TOKEN_TYPE_UNKNOWN);
             attrib |= LLAMA_TOKEN_ATTRIB_UNUSED  * (data.type == LLAMA_TOKEN_TYPE_UNUSED);
@@ -4885,44 +4905,31 @@ static void llm_load_vocab(
             data.attribs = (llama_token_attrib) attrib;
         }
 
-        // set attributes by model name
         std::string model_name;
-        if (ml.get_key(LLM_KV_GENERAL_NAME, model_name, false)) {
-            std::transform(model_name.begin(), model_name.end(), model_name.begin(),
-                [] (const std::string::value_type x) {
-                    return std::tolower(x);
-                }
-            );
+        std::string tokenizer_pre;
 
-            auto _contains_any = [&model_name] (const std::vector<std::string> & substrs) -> bool {
-                for (auto substr : substrs) {
-                    if (model_name.find(substr) < std::string::npos) {
-                        return true;
-                    }
-                }
-                return false;
-            };
+        ml.get_key(LLM_KV_GENERAL_NAME,  model_name,    false);
+        ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
 
-            auto _set_tokenid_attrib = [&] (const llama_vocab::id id, llama_token_attrib attrib, bool value) {
-                uint32_t attribs = vocab.id_to_token[id].attribs;
-                attribs = value ? (attribs | attrib) : (attribs & ~attrib);
-                vocab.id_to_token[id].attribs = (llama_token_attrib) attribs;
-            };
+        // model name to lowercase
+        std::transform(model_name.begin(), model_name.end(), model_name.begin(),
+            [] (const std::string::value_type x) {
+                return std::tolower(x);
+            }
+        );
 
-            auto _set_token_attrib = [&] (const std::string & token, llama_token_attrib attrib, bool value) {
-                _set_tokenid_attrib(vocab.token_to_id.at(token), attrib, value);
-            };
-
-            if (_contains_any({"phi-3", "phi3"})) {
-                for (auto id : vocab.cache_special_tokens) {
-                    _set_tokenid_attrib(id, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
-                }
-                for (auto token : {"</s>"}) {
-                    _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
-                }
-                for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
-                    _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, false);
-                }
+        // set attributes by model/tokenizer name
+        if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
+            _set_token_attrib("<mask>", LLAMA_TOKEN_ATTRIB_LSTRIP, true);
+        } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
+            for (auto id : vocab.cache_special_tokens) {
+                _set_tokenid_attrib(id, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
+            }
+            for (auto token : {"</s>"}) {
+                _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
+            }
+            for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
+                _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, false);
+            }
         }
     }
diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py
index 14f544c4d..9a84d9379 100644
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -156,6 +156,8 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
         '<s>a',                   # Phi-3 fail
         '<unk><|endoftext|><s>',  # Phi-3 fail
         'a\na',                   # TODO: Bert fail
+        'a </s> b',               # rstrip phi-3
+        'a <mask> b',             # lstrip jina-v2
     ]
 
 
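
Reviewer note (not part of the patch): the refactored '_set_tokenid_attrib' helper updates a single
bit in the per-token attribute mask with "attribs = value ? (attribs | attrib) : (attribs & ~attrib)".
Below is a minimal standalone sketch of that bit manipulation; the enum values are illustrative
assumptions, not the real llama.cpp constants.

    // Standalone sketch of the set/clear-bit pattern (assumed bit positions).
    #include <cassert>
    #include <cstdint>

    enum llama_token_attrib : uint32_t {
        LLAMA_TOKEN_ATTRIB_UNDEFINED = 0,
        LLAMA_TOKEN_ATTRIB_LSTRIP    = 1 << 0,  // assumed, for illustration only
        LLAMA_TOKEN_ATTRIB_RSTRIP    = 1 << 1,
    };

    static uint32_t set_attrib(uint32_t attribs, llama_token_attrib attrib, bool value) {
        // OR sets the bit; AND with the complement clears it.
        return value ? (attribs | attrib) : (attribs & ~attrib);
    }

    int main() {
        uint32_t attribs = LLAMA_TOKEN_ATTRIB_UNDEFINED;
        attribs = set_attrib(attribs, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
        assert(attribs & LLAMA_TOKEN_ATTRIB_RSTRIP);
        attribs = set_attrib(attribs, LLAMA_TOKEN_ATTRIB_RSTRIP, false);
        assert(attribs == 0);
        // The type-conversion loop in the patch uses a related trick:
        // "attrib |= FLAG * (condition)" ORs in FLAG only when condition is true,
        // since a false condition multiplies the flag down to zero.
        return 0;
    }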
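
A second note on what the two new test strings exercise. The assumed semantics of the LSTRIP/RSTRIP
attributes (mirroring the lstrip/rstrip flags on HF tokenizers' AddedToken) are that an LSTRIP special
token absorbs whitespace to its left, and an RSTRIP token absorbs whitespace to its right, before the
remaining text is tokenized. The 'absorb_whitespace' helper below is hypothetical, not llama.cpp API;
it only sketches the expected effect on the two edge cases.

    #include <iostream>
    #include <string>

    // Hypothetical helper: remove the spaces an LSTRIP/RSTRIP special token
    // would absorb around its first occurrence in the text.
    static std::string absorb_whitespace(const std::string & text, const std::string & token,
                                         bool lstrip, bool rstrip) {
        const size_t pos = text.find(token);
        if (pos == std::string::npos) {
            return text;
        }
        size_t begin = pos;
        size_t end   = pos + token.size();
        while (lstrip && begin > 0 && text[begin - 1] == ' ') { begin--; }
        while (rstrip && end < text.size() && text[end] == ' ') { end++; }
        return text.substr(0, begin) + token + text.substr(end);
    }

    int main() {
        // jina-v2: '<mask>' carries LSTRIP, so 'a <mask> b' -> "a<mask> b"
        std::cout << absorb_whitespace("a <mask> b", "<mask>", true, false) << "\n";
        // phi-3: '</s>' carries RSTRIP, so 'a </s> b' -> "a </s>b"
        std::cout << absorb_whitespace("a </s> b", "</s>", false, true) << "\n";
        return 0;
    }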