From cec6a3bde95a74bb59f09f0b8ac2010a56b33d49 Mon Sep 17 00:00:00 2001 From: jaime-m-p <> Date: Sat, 1 Jun 2024 19:42:21 +0200 Subject: [PATCH] Add per token attrib enum --- llama.cpp | 32 ++++++++++++++++++++++++++------ llama.h | 14 ++++++++++++++ 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index 841be1de7..02f7be2c1 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2147,14 +2147,16 @@ struct llama_control_vector { }; struct llama_vocab { - using id = int32_t; - using token = std::string; - using ttype = llama_token_type; + using id = int32_t; + using token = std::string; + using ttype = llama_token_type; + using tattrib = llama_token_attrib; struct token_data { - token text; - float score; - ttype type; + token text; + float score; + ttype type; + tattrib attribs; }; enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; @@ -4865,6 +4867,24 @@ static void llm_load_vocab( LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0); } + + // Handle per token attributes + //NOTE: Each model customizes per token attributes. + //NOTE: Per token attributes are missing from the GGUF file. + //TODO: Merge llama_token_type and llama_token_attrib. 
+    {
+        // convert token type as an attribute
+        for (auto & data : vocab.id_to_token) { // by reference: iterating by value would store attribs in a discarded copy
+            uint32_t attrib = LLAMA_TOKEN_ATTRIB_UNDEFINED; // accumulate flags in an unsigned integer, cast back once at the end
+            attrib |= LLAMA_TOKEN_ATTRIB_UNKNOWN * (data.type == LLAMA_TOKEN_TYPE_UNKNOWN); // the comparison yields 0 or 1, so each line sets its flag or nothing
+            attrib |= LLAMA_TOKEN_ATTRIB_UNUSED * (data.type == LLAMA_TOKEN_TYPE_UNUSED);
+            attrib |= LLAMA_TOKEN_ATTRIB_NORMAL * (data.type == LLAMA_TOKEN_TYPE_NORMAL);
+            attrib |= LLAMA_TOKEN_ATTRIB_CONTROL * (data.type == LLAMA_TOKEN_TYPE_CONTROL);
+            attrib |= LLAMA_TOKEN_ATTRIB_USER_DEFINED * (data.type == LLAMA_TOKEN_TYPE_USER_DEFINED);
+            attrib |= LLAMA_TOKEN_ATTRIB_BYTE * (data.type == LLAMA_TOKEN_TYPE_BYTE);
+            data.attribs = (llama_token_attrib) attrib;
+        }
+    }
 }
 
 static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
diff --git a/llama.h b/llama.h
index 95105c28e..a9952d6e0 100644
--- a/llama.h
+++ b/llama.h
@@ -107,6 +107,20 @@ extern "C" {
         LLAMA_TOKEN_TYPE_BYTE = 6,
     };
 
+    enum llama_token_attrib { // per token attributes; every non-zero value is a distinct single bit
+        LLAMA_TOKEN_ATTRIB_UNDEFINED = 0,
+        LLAMA_TOKEN_ATTRIB_UNKNOWN = 1 << 1, // NOTE(review): bit 0 (1 << 0) is left unused - confirm this is intended
+        LLAMA_TOKEN_ATTRIB_UNUSED = 1 << 2,
+        LLAMA_TOKEN_ATTRIB_NORMAL = 1 << 3,
+        LLAMA_TOKEN_ATTRIB_CONTROL = 1 << 4, // SPECIAL?
+        LLAMA_TOKEN_ATTRIB_USER_DEFINED = 1 << 5,
+        LLAMA_TOKEN_ATTRIB_BYTE = 1 << 6,
+        LLAMA_TOKEN_ATTRIB_NORMALIZED = 1 << 7,
+        LLAMA_TOKEN_ATTRIB_LSTRIP = 1 << 8,
+        LLAMA_TOKEN_ATTRIB_RSTRIP = 1 << 9,
+        LLAMA_TOKEN_ATTRIB_SINGLE_WORD = 1 << 10,
+    };
+
     // model file types
     enum llama_ftype {
         LLAMA_FTYPE_ALL_F32 = 0,