llama : vocab load

This commit is contained in:
Georgi Gerganov 2025-01-08 20:49:26 +02:00
parent 695a0037db
commit 885495ccd1
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
4 changed files with 864 additions and 839 deletions

View file

@ -1250,715 +1250,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
}
void llama_model::load_vocab(llama_model_loader & ml) {
struct gguf_context * ctx = ml.meta.get();
const auto kv = LLM_KV(arch);
// determine vocab type
{
std::string tokenizer_model;
std::string tokenizer_pre;
ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
vocab.type = LLAMA_VOCAB_TYPE_NONE;
// default special tokens
vocab.special_bos_id = LLAMA_TOKEN_NULL;
vocab.special_eos_id = LLAMA_TOKEN_NULL;
vocab.special_unk_id = LLAMA_TOKEN_NULL;
vocab.special_sep_id = LLAMA_TOKEN_NULL;
vocab.special_pad_id = LLAMA_TOKEN_NULL;
vocab.special_cls_id = LLAMA_TOKEN_NULL;
vocab.special_mask_id = LLAMA_TOKEN_NULL;
vocab.linefeed_id = LLAMA_TOKEN_NULL;
// read vocab size from metadata
if (!ml.get_key(LLM_KV_VOCAB_SIZE, vocab.n_vocab, false)) {
vocab.n_vocab = 0;
LLAMA_LOG_WARN("%s: there is no vocab_size in metadata, vocab.n_vocab will be set to %u\n", __func__, vocab.n_vocab);
}
return;
}
if (tokenizer_model == "llama") {
vocab.type = LLAMA_VOCAB_TYPE_SPM;
// default special tokens
vocab.special_bos_id = 1;
vocab.special_eos_id = 2;
vocab.special_unk_id = 0;
vocab.special_sep_id = LLAMA_TOKEN_NULL;
vocab.special_pad_id = LLAMA_TOKEN_NULL;
vocab.special_cls_id = LLAMA_TOKEN_NULL;
vocab.special_mask_id = LLAMA_TOKEN_NULL;
} else if (tokenizer_model == "bert") {
vocab.type = LLAMA_VOCAB_TYPE_WPM;
// default special tokens
vocab.special_bos_id = LLAMA_TOKEN_NULL;
vocab.special_eos_id = LLAMA_TOKEN_NULL;
vocab.special_unk_id = 100;
vocab.special_sep_id = 102;
vocab.special_pad_id = 0;
vocab.special_cls_id = 101;
vocab.special_mask_id = 103;
} else if (tokenizer_model == "gpt2") {
vocab.type = LLAMA_VOCAB_TYPE_BPE;
// read bpe merges and populate bpe ranks
const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
if (merges_keyidx == -1) {
throw std::runtime_error("cannot find tokenizer merges in model file\n");
}
const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
for (int i = 0; i < n_merges; i++) {
const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
//GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
std::string first;
std::string second;
const size_t pos = word.find(' ', 1);
if (pos != std::string::npos) {
first = word.substr(0, pos);
second = word.substr(pos + 1);
}
vocab.bpe_ranks.emplace(std::make_pair(first, second), i);
}
// default special tokens
vocab.special_bos_id = 11;
vocab.special_eos_id = 11;
vocab.special_unk_id = LLAMA_TOKEN_NULL;
vocab.special_sep_id = LLAMA_TOKEN_NULL;
vocab.special_pad_id = LLAMA_TOKEN_NULL;
vocab.special_cls_id = LLAMA_TOKEN_NULL;
vocab.special_mask_id = LLAMA_TOKEN_NULL;
} else if (tokenizer_model == "t5") {
vocab.type = LLAMA_VOCAB_TYPE_UGM;
// default special tokens
vocab.special_bos_id = LLAMA_TOKEN_NULL;
vocab.special_eos_id = 1;
vocab.special_unk_id = 2;
vocab.special_sep_id = LLAMA_TOKEN_NULL;
vocab.special_pad_id = 0;
vocab.special_cls_id = LLAMA_TOKEN_NULL;
vocab.special_mask_id = LLAMA_TOKEN_NULL;
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
if (precompiled_charsmap_keyidx != -1) {
size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
const char * precompiled_charsmap = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
vocab.precompiled_charsmap.assign(precompiled_charsmap, precompiled_charsmap + n_precompiled_charsmap);
#ifdef IS_BIG_ENDIAN
// correct endiannes of data in precompiled_charsmap binary blob
uint32_t * xcda_blob_size = (uint32_t *) &vocab.precompiled_charsmap[0];
*xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
uint32_t * xcda_array = (uint32_t *) &vocab.precompiled_charsmap[sizeof(uint32_t)];
for (size_t i = 0; i < xcda_array_size; ++i) {
xcda_array[i] = __builtin_bswap32(xcda_array[i]);
}
#endif
}
} else if (tokenizer_model == "rwkv") {
vocab.type = LLAMA_VOCAB_TYPE_RWKV;
// default special tokens
vocab.special_bos_id = LLAMA_TOKEN_NULL;
vocab.special_eos_id = LLAMA_TOKEN_NULL;
vocab.special_unk_id = LLAMA_TOKEN_NULL;
vocab.special_sep_id = LLAMA_TOKEN_NULL;
vocab.special_pad_id = LLAMA_TOKEN_NULL;
} else {
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
}
// for now, only BPE models have pre-tokenizers
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
vocab.tokenizer_add_space_prefix = false;
vocab.tokenizer_clean_spaces = true;
if (tokenizer_pre.empty()) {
LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
LLAMA_LOG_WARN("%s: \n", __func__);
LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
LLAMA_LOG_WARN("%s: \n", __func__);
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (tokenizer_pre == "default") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
tokenizer_pre == "llama-v3" ||
tokenizer_pre == "llama-bpe"||
tokenizer_pre == "falcon3") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
vocab.tokenizer_ignore_merges = true;
vocab.tokenizer_add_bos = true;
} else if (
tokenizer_pre == "deepseek-llm") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
vocab.tokenizer_clean_spaces = false;
} else if (
tokenizer_pre == "deepseek-coder") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
vocab.tokenizer_clean_spaces = false;
} else if (
tokenizer_pre == "deepseek-v3") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
vocab.tokenizer_clean_spaces = false;
} else if (
tokenizer_pre == "falcon") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
} else if (
tokenizer_pre == "mpt") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
} else if (
tokenizer_pre == "starcoder") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
} else if (
tokenizer_pre == "gpt-2" ||
tokenizer_pre == "phi-2" ||
tokenizer_pre == "jina-es" ||
tokenizer_pre == "jina-de" ||
tokenizer_pre == "gigachat" ||
tokenizer_pre == "jina-v1-en" ||
tokenizer_pre == "jina-v2-es" ||
tokenizer_pre == "jina-v2-de" ||
tokenizer_pre == "jina-v2-code" ||
tokenizer_pre == "roberta-bpe") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
} else if (
tokenizer_pre == "refact") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
} else if (
tokenizer_pre == "command-r") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
vocab.tokenizer_clean_spaces = false;
} else if (
tokenizer_pre == "qwen2") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
vocab.tokenizer_clean_spaces = false;
} else if (
tokenizer_pre == "stablelm2") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
} else if (
tokenizer_pre == "olmo") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
} else if (
tokenizer_pre == "dbrx") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
} else if (
tokenizer_pre == "smaug-bpe") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
} else if (
tokenizer_pre == "poro-chat") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
vocab.tokenizer_clean_spaces = false;
} else if (
tokenizer_pre == "chatglm-bpe") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
vocab.special_bos_id = LLAMA_TOKEN_NULL;
} else if (
tokenizer_pre == "viking") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
vocab.tokenizer_clean_spaces = false;
} else if (
tokenizer_pre == "jais") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
} else if (
tokenizer_pre == "tekken") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
vocab.tokenizer_clean_spaces = false;
vocab.tokenizer_ignore_merges = true;
vocab.tokenizer_add_bos = true;
} else if (
tokenizer_pre == "smollm") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
vocab.tokenizer_clean_spaces = false;
} else if (
tokenizer_pre == "codeshell") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
} else if (
tokenizer_pre == "bloom") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_BLOOM;
} else if (
tokenizer_pre == "gpt3-finnish") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
} else if (
tokenizer_pre == "exaone") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
} else if (
tokenizer_pre == "chameleon") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
vocab.tokenizer_add_bos = true;
vocab.tokenizer_clean_spaces = false;
} else if (
tokenizer_pre == "minerva-7b") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
} else if (
tokenizer_pre == "megrez") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
} else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
vocab.tokenizer_add_space_prefix = true;
vocab.tokenizer_clean_spaces = false;
vocab.tokenizer_add_bos = true;
vocab.tokenizer_add_eos = false;
} else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
vocab.tokenizer_add_space_prefix = false;
vocab.tokenizer_clean_spaces = true;
vocab.tokenizer_add_bos = true;
vocab.tokenizer_add_eos = false;
} else if (vocab.type == LLAMA_VOCAB_TYPE_UGM) {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
vocab.tokenizer_add_bos = false;
vocab.tokenizer_add_eos = true;
} else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
vocab.tokenizer_add_space_prefix = false;
vocab.tokenizer_clean_spaces = false;
vocab.tokenizer_add_bos = false;
vocab.tokenizer_add_eos = false;
} else {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
}
ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.tokenizer_add_space_prefix, false);
ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.tokenizer_remove_extra_whitespaces, false);
}
const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
if (token_idx == -1) {
throw std::runtime_error("cannot find tokenizer vocab in model file\n");
}
const float * scores = nullptr;
const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
if (score_idx != -1) {
scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
}
const int * toktypes = nullptr;
const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
if (toktype_idx != -1) {
toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
}
const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
vocab.n_vocab = n_vocab;
vocab.id_to_token.resize(n_vocab);
for (uint32_t i = 0; i < n_vocab; i++) {
std::string word = gguf_get_arr_str(ctx, token_idx, i);
if (word.empty()) {
LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
word = "[EMPTY_" + std::to_string(i) + "]";
}
vocab.token_to_id[word] = i;
vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
auto & token_data = vocab.id_to_token[i];
token_data.text = std::move(word);
token_data.score = scores ? scores[i] : 0.0f;
token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
switch(toktypes[i]) {
case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
}
}
}
GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
vocab.init_tokenizer();
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
try {
vocab.linefeed_id = vocab.byte_to_token('\n');
} catch (const std::exception & e) {
LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
vocab.linefeed_id = vocab.special_pad_id;
}
} else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
vocab.linefeed_id = vocab.special_pad_id;
} else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
const std::vector<int> ids = vocab.tokenize("\n", false);
GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
vocab.linefeed_id = ids[0];
} else {
const std::vector<int> ids = vocab.tokenize("\xC4\x8A", false); // U+010A
//GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
if (ids.empty()) {
LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
vocab.linefeed_id = vocab.special_pad_id;
} else {
vocab.linefeed_id = ids[0];
}
}
// special tokens
{
const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
{ LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
{ LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
{ LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
{ LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id },
{ LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
{ LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
{ LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
{ LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
{ LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
{ LLM_KV_TOKENIZER_FIM_PRE_ID, vocab.special_fim_pre_id },
{ LLM_KV_TOKENIZER_FIM_SUF_ID, vocab.special_fim_suf_id },
{ LLM_KV_TOKENIZER_FIM_MID_ID, vocab.special_fim_mid_id },
{ LLM_KV_TOKENIZER_FIM_PAD_ID, vocab.special_fim_pad_id },
{ LLM_KV_TOKENIZER_FIM_REP_ID, vocab.special_fim_rep_id },
{ LLM_KV_TOKENIZER_FIM_SEP_ID, vocab.special_fim_sep_id },
// deprecated
{ LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_fim_pre_id },
{ LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_fim_suf_id },
{ LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_fim_mid_id },
};
for (const auto & it : special_token_types) {
const std::string & key = kv(std::get<0>(it));
int32_t & id = std::get<1>(it);
uint32_t new_id;
if (!ml.get_key(std::get<0>(it), new_id, false)) {
continue;
}
if (new_id >= vocab.id_to_token.size()) {
LLAMA_LOG_WARN("%s: bad special token: '%s' = %ud, using default id %d\n",
__func__, key.c_str(), new_id, id);
} else {
id = new_id;
}
}
// Handle add_bos_token and add_eos_token
{
bool temp = true;
if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
vocab.tokenizer_add_bos = temp;
}
if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
vocab.tokenizer_add_eos = temp;
}
}
// auto-detect special tokens by text
// TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
// for now, we apply this workaround to find the tokens based on their text
for (const auto & t : vocab.token_to_id) {
// find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
if (vocab.special_eot_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<|eot_id|>"
|| t.first == "<|im_end|>"
|| t.first == "<|end|>"
|| t.first == "<end_of_turn>"
|| t.first == "<|endoftext|>"
|| t.first == "<EOT>"
|| t.first == "<end▁of▁sentence>" // DeepSeek
) {
vocab.special_eot_id = t.second;
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
// find EOM token: "<|eom_id|>"
if (vocab.special_eom_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<|eom_id|>"
) {
vocab.special_eom_id = t.second;
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
// find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
if (vocab.special_fim_pre_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<|fim_prefix|>" // Qwen
|| t.first == "<fim-prefix>"
|| t.first == "<fim▁begin>" // DeepSeek
|| t.first == "<PRE>"
) {
vocab.special_fim_pre_id = t.second;
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
// find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
if (vocab.special_fim_suf_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<|fim_suffix|>" // Qwen
|| t.first == "<fim-suffix>"
|| t.first == "<fim▁hole>" // DeepSeek
|| t.first == "<SUF>"
) {
vocab.special_fim_suf_id = t.second;
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
// find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
if (vocab.special_fim_mid_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<|fim_middle|>" // Qwen
|| t.first == "<fim-middle>"
|| t.first == "<fim▁end>" // DeepSeek
|| t.first == "<MID>"
) {
vocab.special_fim_mid_id = t.second;
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
// find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
if (vocab.special_fim_pad_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<|fim_pad|>" // Qwen
|| t.first == "<fim-pad>"
|| t.first == "<PAD>"
) {
vocab.special_fim_pad_id = t.second;
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
// find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
if (vocab.special_fim_rep_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<|fim_repo|>" // Qwen
|| t.first == "<|repo_name|>"
|| t.first == "<fim-repo>"
|| t.first == "<REPO>"
) {
vocab.special_fim_rep_id = t.second;
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
// find FIM_SEP token: "<|file_sep|>"
if (vocab.special_fim_sep_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<|file_sep|>" // Qwen
) {
vocab.special_fim_sep_id = t.second;
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
}
// maintain a list of tokens that cause end-of-generation
// this is currently determined based on the token text, which is obviously not ideal
// ref: https://github.com/ggerganov/llama.cpp/issues/9606
vocab.special_eog_ids.clear();
if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_pad_id) == 0) {
vocab.special_eog_ids.insert(vocab.special_fim_pad_id);
}
if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_rep_id) == 0) {
vocab.special_eog_ids.insert(vocab.special_fim_rep_id);
}
if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_sep_id) == 0) {
vocab.special_eog_ids.insert(vocab.special_fim_sep_id);
}
for (const auto & t : vocab.token_to_id) {
if (false
|| t.first == "<|eot_id|>"
|| t.first == "<|im_end|>"
|| t.first == "<|end|>"
|| t.first == "<end_of_turn>"
|| t.first == "<|endoftext|>"
|| t.first == "<|eom_id|>"
|| t.first == "<EOT>"
) {
vocab.special_eog_ids.insert(t.second);
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
} else {
// token is control, but not marked as EOG -> print a debug log
if (vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && vocab.special_eog_ids.count(t.second) == 0) {
LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
__func__, t.second, t.first.c_str());
}
}
}
// sanity checks
if (vocab.special_eos_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
vocab.special_eog_ids.insert(vocab.special_eos_id);
LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
}
if (vocab.special_eot_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
vocab.special_eog_ids.insert(vocab.special_eot_id);
LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
}
if (vocab.special_eom_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
vocab.special_eog_ids.insert(vocab.special_eom_id);
LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
}
}
// build special tokens cache
{
for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
if (vocab.id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
vocab.cache_special_tokens.push_back(id);
}
}
std::sort(vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
[&] (const llama_vocab::id a, const llama_vocab::id b) {
return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size();
}
);
LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
}
// build token to piece cache
{
size_t size_cache = 0;
std::vector<llama_vocab::token> cache_token_to_piece(n_vocab);
for (uint32_t id = 0; id < n_vocab; ++id) {
cache_token_to_piece[id] = token_to_piece(id, true);
size_cache += cache_token_to_piece[id].size();
}
std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
}
// Handle per token attributes
//NOTE: Each model customizes per token attributes.
//NOTE: Per token attributes are missing from the GGUF file.
//TODO: Extract attributes from GGUF file.
{
auto _contains_any = [] (const std::string & str, const std::vector<std::string> & substrs) -> bool {
for (const auto & substr : substrs) {
if (str.find(substr) < std::string::npos) {
return true;
}
}
return false;
};
auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
uint32_t current = vocab.id_to_token.at(id).attr;
current = value ? (current | attr) : (current & ~attr);
vocab.id_to_token[id].attr = (llama_token_attr) current;
};
auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
_set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
};
std::string model_name;
std::string tokenizer_pre;
ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
// model name to lowercase
std::transform(model_name.begin(), model_name.end(), model_name.begin(),
[] (const std::string::value_type x) {
return std::tolower(x);
}
);
// set attributes by model/tokenizer name
if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
_set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
} else if (_contains_any(model_name, {"phi-3", "phi3"})) {
for (auto id : vocab.cache_special_tokens) {
_set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
}
for (auto token : {"</s>"}) {
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
}
for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
}
}
}
vocab.load(ml, kv);
}
bool llama_model::load_tensors(llama_model_loader & ml) {
@ -4441,23 +3735,6 @@ const struct ggml_tensor * llama_model::get_tensor(const char * name) const {
return it->second;
}
// NOTE: avoid ever using this except for building the token_to_piece caches
std::string llama_model::token_to_piece(llama_token token, bool special) const {
std::string piece;
piece.resize(piece.capacity()); // using string internal cache
const int n_chars = llama_token_to_piece(this, token, &piece[0], piece.size(), 0, special);
if (n_chars < 0) {
piece.resize(-n_chars);
int check = llama_token_to_piece(this, token, &piece[0], piece.size(), 0, special);
GGML_ASSERT(check == -n_chars);
}
else {
piece.resize(n_chars);
}
return piece;
}
//
// interface implementation
//

View file

@ -367,8 +367,6 @@ struct llama_model {
private:
struct impl;
std::unique_ptr<impl> pimpl;
std::string token_to_piece(llama_token token, bool special) const;
};
const char * llm_type_name(llm_type type);

File diff suppressed because it is too large Load diff

View file

@ -9,6 +9,9 @@
#include <set>
#include <memory>
struct LLM_KV;
struct llama_model_loader;
struct llama_vocab {
using id = llama_token;
using token = std::string;
@ -75,9 +78,7 @@ struct llama_vocab {
llama_vocab();
~llama_vocab();
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
void init_tokenizer();
void load(llama_model_loader & ml, const LLM_KV & kv);
enum llama_vocab_type get_type() const;
@ -121,6 +122,8 @@ struct llama_vocab {
bool add_bos_token() const;
bool add_eos_token() const;
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
std::vector<id> tokenize(
std::string raw_text,
bool add_special,
@ -162,4 +165,8 @@ struct llama_vocab {
private:
struct impl;
std::unique_ptr<impl> pimpl;
std::string token_to_piece_for_cache(
llama_token token,
bool special) const;
};