Add per token attrib enum

This commit is contained in:
jaime-m-p 2024-06-01 19:42:21 +02:00
parent 750f60c03e
commit cec6a3bde9
2 changed files with 40 additions and 6 deletions

View file

@ -2147,14 +2147,16 @@ struct llama_control_vector {
}; };
struct llama_vocab { struct llama_vocab {
using id = int32_t; using id = int32_t;
using token = std::string; using token = std::string;
using ttype = llama_token_type; using ttype = llama_token_type;
using tattrib = llama_token_attrib;
struct token_data { struct token_data {
token text; token text;
float score; float score;
ttype type; ttype type;
tattrib attribs;
}; };
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
@ -4865,6 +4867,24 @@ static void llm_load_vocab(
LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0); LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
} }
// Handle per token attributes
// NOTE: Each model customizes per token attributes.
// NOTE: Per token attributes are missing from the GGUF file.
// TODO: Merge llama_token_type and llama_token_attrib.
{
    // Convert each token's type into the matching attribute bit and store it
    // on the vocab entry. The loop variable must be a reference: iterating by
    // value would assign `attribs` on a temporary copy and leave every entry
    // in vocab.id_to_token unchanged.
    for (auto & data : vocab.id_to_token) {
        // Each comparison yields 0 or 1, so at most one flag is OR-ed in;
        // a type matching none of them leaves the value at UNDEFINED.
        uint32_t attrib = LLAMA_TOKEN_ATTRIB_UNDEFINED;
        attrib |= LLAMA_TOKEN_ATTRIB_UNKNOWN      * (data.type == LLAMA_TOKEN_TYPE_UNKNOWN);
        attrib |= LLAMA_TOKEN_ATTRIB_UNUSED       * (data.type == LLAMA_TOKEN_TYPE_UNUSED);
        attrib |= LLAMA_TOKEN_ATTRIB_NORMAL       * (data.type == LLAMA_TOKEN_TYPE_NORMAL);
        attrib |= LLAMA_TOKEN_ATTRIB_CONTROL      * (data.type == LLAMA_TOKEN_TYPE_CONTROL);
        attrib |= LLAMA_TOKEN_ATTRIB_USER_DEFINED * (data.type == LLAMA_TOKEN_TYPE_USER_DEFINED);
        attrib |= LLAMA_TOKEN_ATTRIB_BYTE         * (data.type == LLAMA_TOKEN_TYPE_BYTE);
        data.attribs = (llama_token_attrib) attrib;
    }
}
} }
static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {

14
llama.h
View file

@ -107,6 +107,20 @@ extern "C" {
LLAMA_TOKEN_TYPE_BYTE = 6, LLAMA_TOKEN_TYPE_BYTE = 6,
}; };
// Per-token attribute flags.
// Every named attribute occupies a distinct bit, so several attributes can be
// OR-ed together into a single value; UNDEFINED is the empty set.
// NOTE(review): bit 0 (0x0001) is not assigned to any attribute - confirm
// whether that gap is intentional.
enum llama_token_attrib {
    LLAMA_TOKEN_ATTRIB_UNDEFINED    = 0x0000, // no attribute bits set
    LLAMA_TOKEN_ATTRIB_UNKNOWN      = 0x0002,
    LLAMA_TOKEN_ATTRIB_UNUSED       = 0x0004,
    LLAMA_TOKEN_ATTRIB_NORMAL       = 0x0008,
    LLAMA_TOKEN_ATTRIB_CONTROL      = 0x0010, // SPECIAL?
    LLAMA_TOKEN_ATTRIB_USER_DEFINED = 0x0020,
    LLAMA_TOKEN_ATTRIB_BYTE         = 0x0040,
    LLAMA_TOKEN_ATTRIB_NORMALIZED   = 0x0080,
    LLAMA_TOKEN_ATTRIB_LSTRIP       = 0x0100,
    LLAMA_TOKEN_ATTRIB_RSTRIP       = 0x0200,
    LLAMA_TOKEN_ATTRIB_SINGLE_WORD  = 0x0400,
};
// model file types // model file types
enum llama_ftype { enum llama_ftype {
LLAMA_FTYPE_ALL_F32 = 0, LLAMA_FTYPE_ALL_F32 = 0,