From 3ead1b9757e417533408101e9287313c2965cdeb Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Sat, 1 Jun 2024 19:45:14 +0200
Subject: [PATCH] Using phi-3 for testing 'rstrip'

---
 llama.cpp                      | 38 ++++++++++++++++++++++++++++++++++
 tests/test-tokenizer-random.py |  6 +++---
 2 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 02f7be2c1..0e77585b5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4884,6 +4884,44 @@ static void llm_load_vocab(
             attrib |= LLAMA_TOKEN_ATTRIB_BYTE    * (data.type == LLAMA_TOKEN_TYPE_BYTE);
             data.attribs = (llama_token_attrib) attrib;
         }
+
+        // set attributes by model name
+        std::string model_name;
+        if (ml.get_key(LLM_KV_GENERAL_NAME, model_name, false)) {
+            std::transform(model_name.begin(), model_name.end(), model_name.begin(),
+                [] (const std::string::value_type x) {
+                    return std::tolower(x);
+                }
+            );
+
+            auto _contains_any = [&model_name] (const std::vector<std::string> & substrs) -> bool {
+                for (auto substr : substrs) {
+                    if (model_name.find(substr) < std::string::npos) {
+                        return true;
+                    }
+                }
+                return false;
+            };
+
+            auto _set_token_attrib = [&vocab] (const std::string & token, llama_token_attrib attrib, bool value) {
+                llama_vocab::id id = vocab.token_to_id.at(token);
+                uint32_t attribs = vocab.id_to_token[id].attribs;
+                attribs = value ? (attribs | attrib) : (attribs & ~attrib);
+                vocab.id_to_token[id].attribs = (llama_token_attrib) attribs;
+            };
+
+            if (_contains_any({"phi-3", "phi3"})) {
+                for (auto token : vocab.cache_token_to_piece_special) {
+                    _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
+                }
+                for (auto token : {"</s>"}) {
+                    _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, true);
+                }
+                for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
+                    _set_token_attrib(token, LLAMA_TOKEN_ATTRIB_RSTRIP, false);
+                }
+            }
+        }
     }
 }
 
diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py
index ec1b2837c..14f544c4d 100644
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -329,9 +329,9 @@ if __name__ == "__main__":
     # tokenizers = os.listdir(path_tokenizers)
     tokenizers = [
         # "llama-spm",   # SPM
-        # "phi-3",       # SPM
-        "jina-v2-en",    # WPM
-        "bert-bge",      # WPM
+        "phi-3",         # SPM
+        # "jina-v2-en",  # WPM
+        # "bert-bge",    # WPM
     ]
 
     for tokenizer in tokenizers: