From 885495ccd1339d50e941ac63228ee5270e4bca2a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 8 Jan 2025 20:49:26 +0200 Subject: [PATCH] llama : vocab load --- src/llama-model.cpp | 725 +-------------------------------- src/llama-model.h | 2 - src/llama-vocab.cpp | 963 +++++++++++++++++++++++++++++++++++++++----- src/llama-vocab.h | 13 +- 4 files changed, 864 insertions(+), 839 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e2405b979..39c60ee1f 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1250,715 +1250,9 @@ void llama_model::load_hparams(llama_model_loader & ml) { } void llama_model::load_vocab(llama_model_loader & ml) { - struct gguf_context * ctx = ml.meta.get(); - const auto kv = LLM_KV(arch); - // determine vocab type - { - std::string tokenizer_model; - std::string tokenizer_pre; - - ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model); - ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false); - - if (tokenizer_model == "no_vocab" || tokenizer_model == "none") { - vocab.type = LLAMA_VOCAB_TYPE_NONE; - - // default special tokens - vocab.special_bos_id = LLAMA_TOKEN_NULL; - vocab.special_eos_id = LLAMA_TOKEN_NULL; - vocab.special_unk_id = LLAMA_TOKEN_NULL; - vocab.special_sep_id = LLAMA_TOKEN_NULL; - vocab.special_pad_id = LLAMA_TOKEN_NULL; - vocab.special_cls_id = LLAMA_TOKEN_NULL; - vocab.special_mask_id = LLAMA_TOKEN_NULL; - vocab.linefeed_id = LLAMA_TOKEN_NULL; - - // read vocab size from metadata - if (!ml.get_key(LLM_KV_VOCAB_SIZE, vocab.n_vocab, false)) { - vocab.n_vocab = 0; - LLAMA_LOG_WARN("%s: there is no vocab_size in metadata, vocab.n_vocab will be set to %u\n", __func__, vocab.n_vocab); - } - - return; - } - - if (tokenizer_model == "llama") { - vocab.type = LLAMA_VOCAB_TYPE_SPM; - - // default special tokens - vocab.special_bos_id = 1; - vocab.special_eos_id = 2; - vocab.special_unk_id = 0; - vocab.special_sep_id = LLAMA_TOKEN_NULL; - vocab.special_pad_id = LLAMA_TOKEN_NULL; - 
vocab.special_cls_id = LLAMA_TOKEN_NULL; - vocab.special_mask_id = LLAMA_TOKEN_NULL; - } else if (tokenizer_model == "bert") { - vocab.type = LLAMA_VOCAB_TYPE_WPM; - - // default special tokens - vocab.special_bos_id = LLAMA_TOKEN_NULL; - vocab.special_eos_id = LLAMA_TOKEN_NULL; - vocab.special_unk_id = 100; - vocab.special_sep_id = 102; - vocab.special_pad_id = 0; - vocab.special_cls_id = 101; - vocab.special_mask_id = 103; - } else if (tokenizer_model == "gpt2") { - vocab.type = LLAMA_VOCAB_TYPE_BPE; - - // read bpe merges and populate bpe ranks - const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str()); - if (merges_keyidx == -1) { - throw std::runtime_error("cannot find tokenizer merges in model file\n"); - } - - const int n_merges = gguf_get_arr_n(ctx, merges_keyidx); - for (int i = 0; i < n_merges; i++) { - const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i); - //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0); - - std::string first; - std::string second; - - const size_t pos = word.find(' ', 1); - - if (pos != std::string::npos) { - first = word.substr(0, pos); - second = word.substr(pos + 1); - } - - vocab.bpe_ranks.emplace(std::make_pair(first, second), i); - } - - // default special tokens - vocab.special_bos_id = 11; - vocab.special_eos_id = 11; - vocab.special_unk_id = LLAMA_TOKEN_NULL; - vocab.special_sep_id = LLAMA_TOKEN_NULL; - vocab.special_pad_id = LLAMA_TOKEN_NULL; - vocab.special_cls_id = LLAMA_TOKEN_NULL; - vocab.special_mask_id = LLAMA_TOKEN_NULL; - } else if (tokenizer_model == "t5") { - vocab.type = LLAMA_VOCAB_TYPE_UGM; - - // default special tokens - vocab.special_bos_id = LLAMA_TOKEN_NULL; - vocab.special_eos_id = 1; - vocab.special_unk_id = 2; - vocab.special_sep_id = LLAMA_TOKEN_NULL; - vocab.special_pad_id = 0; - vocab.special_cls_id = LLAMA_TOKEN_NULL; - vocab.special_mask_id = LLAMA_TOKEN_NULL; - - const int precompiled_charsmap_keyidx = gguf_find_key(ctx, 
kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str()); - if (precompiled_charsmap_keyidx != -1) { - size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx); - const char * precompiled_charsmap = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx); - vocab.precompiled_charsmap.assign(precompiled_charsmap, precompiled_charsmap + n_precompiled_charsmap); -#ifdef IS_BIG_ENDIAN - // correct endiannes of data in precompiled_charsmap binary blob - uint32_t * xcda_blob_size = (uint32_t *) &vocab.precompiled_charsmap[0]; - *xcda_blob_size = __builtin_bswap32(*xcda_blob_size); - assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap); - size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t); - uint32_t * xcda_array = (uint32_t *) &vocab.precompiled_charsmap[sizeof(uint32_t)]; - for (size_t i = 0; i < xcda_array_size; ++i) { - xcda_array[i] = __builtin_bswap32(xcda_array[i]); - } -#endif - } - } else if (tokenizer_model == "rwkv") { - vocab.type = LLAMA_VOCAB_TYPE_RWKV; - - // default special tokens - vocab.special_bos_id = LLAMA_TOKEN_NULL; - vocab.special_eos_id = LLAMA_TOKEN_NULL; - vocab.special_unk_id = LLAMA_TOKEN_NULL; - vocab.special_sep_id = LLAMA_TOKEN_NULL; - vocab.special_pad_id = LLAMA_TOKEN_NULL; - } else { - throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str())); - } - - // for now, only BPE models have pre-tokenizers - if (vocab.type == LLAMA_VOCAB_TYPE_BPE) { - vocab.tokenizer_add_space_prefix = false; - vocab.tokenizer_clean_spaces = true; - if (tokenizer_pre.empty()) { - LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__); - LLAMA_LOG_WARN("%s: \n", __func__); - LLAMA_LOG_WARN("%s: ************************************ \n", __func__); - LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! 
\n", __func__); - LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__); - LLAMA_LOG_WARN("%s: ************************************ \n", __func__); - LLAMA_LOG_WARN("%s: \n", __func__); - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; - } else if (tokenizer_pre == "default") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; - } else if ( - tokenizer_pre == "llama3" || - tokenizer_pre == "llama-v3" || - tokenizer_pre == "llama-bpe"|| - tokenizer_pre == "falcon3") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3; - vocab.tokenizer_ignore_merges = true; - vocab.tokenizer_add_bos = true; - } else if ( - tokenizer_pre == "deepseek-llm") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM; - vocab.tokenizer_clean_spaces = false; - } else if ( - tokenizer_pre == "deepseek-coder") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER; - vocab.tokenizer_clean_spaces = false; - } else if ( - tokenizer_pre == "deepseek-v3") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM; - vocab.tokenizer_clean_spaces = false; - } else if ( - tokenizer_pre == "falcon") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON; - } else if ( - tokenizer_pre == "mpt") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT; - } else if ( - tokenizer_pre == "starcoder") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER; - } else if ( - tokenizer_pre == "gpt-2" || - tokenizer_pre == "phi-2" || - tokenizer_pre == "jina-es" || - tokenizer_pre == "jina-de" || - tokenizer_pre == "gigachat" || - tokenizer_pre == "jina-v1-en" || - tokenizer_pre == "jina-v2-es" || - tokenizer_pre == "jina-v2-de" || - tokenizer_pre == "jina-v2-code" || - tokenizer_pre == "roberta-bpe") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2; - } else if ( - tokenizer_pre == "refact") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT; - } else if ( - tokenizer_pre == "command-r") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R; - vocab.tokenizer_clean_spaces = false; - } else if ( - tokenizer_pre == "qwen2") { - 
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2; - vocab.tokenizer_clean_spaces = false; - } else if ( - tokenizer_pre == "stablelm2") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2; - } else if ( - tokenizer_pre == "olmo") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO; - } else if ( - tokenizer_pre == "dbrx") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX; - } else if ( - tokenizer_pre == "smaug-bpe") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG; - } else if ( - tokenizer_pre == "poro-chat") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO; - vocab.tokenizer_clean_spaces = false; - } else if ( - tokenizer_pre == "chatglm-bpe") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4; - vocab.special_bos_id = LLAMA_TOKEN_NULL; - } else if ( - tokenizer_pre == "viking") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING; - vocab.tokenizer_clean_spaces = false; - } else if ( - tokenizer_pre == "jais") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS; - } else if ( - tokenizer_pre == "tekken") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN; - vocab.tokenizer_clean_spaces = false; - vocab.tokenizer_ignore_merges = true; - vocab.tokenizer_add_bos = true; - } else if ( - tokenizer_pre == "smollm") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM; - vocab.tokenizer_clean_spaces = false; - } else if ( - tokenizer_pre == "codeshell") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL; - } else if ( - tokenizer_pre == "bloom") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_BLOOM; - } else if ( - tokenizer_pre == "gpt3-finnish") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH; - } else if ( - tokenizer_pre == "exaone") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE; - } else if ( - tokenizer_pre == "chameleon") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON; - vocab.tokenizer_add_bos = true; - vocab.tokenizer_clean_spaces = false; - } else if ( - tokenizer_pre == "minerva-7b") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA; - } else if ( - tokenizer_pre == "megrez") { 
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2; - } else { - throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); - } - } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; - vocab.tokenizer_add_space_prefix = true; - vocab.tokenizer_clean_spaces = false; - vocab.tokenizer_add_bos = true; - vocab.tokenizer_add_eos = false; - } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; - vocab.tokenizer_add_space_prefix = false; - vocab.tokenizer_clean_spaces = true; - vocab.tokenizer_add_bos = true; - vocab.tokenizer_add_eos = false; - } else if (vocab.type == LLAMA_VOCAB_TYPE_UGM) { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; - vocab.tokenizer_add_bos = false; - vocab.tokenizer_add_eos = true; - } else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; - vocab.tokenizer_add_space_prefix = false; - vocab.tokenizer_clean_spaces = false; - vocab.tokenizer_add_bos = false; - vocab.tokenizer_add_eos = false; - } else { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; - } - - ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.tokenizer_add_space_prefix, false); - ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.tokenizer_remove_extra_whitespaces, false); - } - - const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str()); - if (token_idx == -1) { - throw std::runtime_error("cannot find tokenizer vocab in model file\n"); - } - - const float * scores = nullptr; - const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str()); - if (score_idx != -1) { - scores = (const float * ) gguf_get_arr_data(ctx, score_idx); - } - - const int * toktypes = nullptr; - const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str()); - if (toktype_idx != -1) { - toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); - } - - const uint32_t n_vocab = gguf_get_arr_n(ctx, 
token_idx); - - vocab.n_vocab = n_vocab; - vocab.id_to_token.resize(n_vocab); - - for (uint32_t i = 0; i < n_vocab; i++) { - std::string word = gguf_get_arr_str(ctx, token_idx, i); - if (word.empty()) { - LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i); - word = "[EMPTY_" + std::to_string(i) + "]"; - } - - vocab.token_to_id[word] = i; - vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size()); - - auto & token_data = vocab.id_to_token[i]; - token_data.text = std::move(word); - token_data.score = scores ? scores[i] : 0.0f; - token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; - - if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file - switch(toktypes[i]) { - case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break; - case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break; - case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break; - case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break; - case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break; - case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break; - case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break; - default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break; - } - } - } - GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size()); - - vocab.init_tokenizer(); - - // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' - if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { - try { - vocab.linefeed_id = vocab.byte_to_token('\n'); - } catch (const std::exception & e) { - LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! 
Using special_pad_id instead.", __func__, e.what()); - vocab.linefeed_id = vocab.special_pad_id; - } - } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) { - vocab.linefeed_id = vocab.special_pad_id; - } else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) { - const std::vector ids = vocab.tokenize("\n", false); - GGML_ASSERT(!ids.empty() && "model vocab missing newline token"); - vocab.linefeed_id = ids[0]; - } else { - const std::vector ids = vocab.tokenize("\xC4\x8A", false); // U+010A - - //GGML_ASSERT(!ids.empty() && "model vocab missing newline token"); - if (ids.empty()) { - LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__); - vocab.linefeed_id = vocab.special_pad_id; - } else { - vocab.linefeed_id = ids[0]; - } - } - - // special tokens - { - const std::vector> special_token_types = { - { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id }, - { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id }, - { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id }, - { LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id }, - { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id }, - { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id }, - { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id }, - { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id }, - { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id }, - { LLM_KV_TOKENIZER_FIM_PRE_ID, vocab.special_fim_pre_id }, - { LLM_KV_TOKENIZER_FIM_SUF_ID, vocab.special_fim_suf_id }, - { LLM_KV_TOKENIZER_FIM_MID_ID, vocab.special_fim_mid_id }, - { LLM_KV_TOKENIZER_FIM_PAD_ID, vocab.special_fim_pad_id }, - { LLM_KV_TOKENIZER_FIM_REP_ID, vocab.special_fim_rep_id }, - { LLM_KV_TOKENIZER_FIM_SEP_ID, vocab.special_fim_sep_id }, - - // deprecated - { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_fim_pre_id }, - { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_fim_suf_id }, - { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_fim_mid_id }, - }; - - for (const auto & it : special_token_types) { - const std::string & key = kv(std::get<0>(it)); - int32_t & 
id = std::get<1>(it); - - uint32_t new_id; - if (!ml.get_key(std::get<0>(it), new_id, false)) { - continue; - } - if (new_id >= vocab.id_to_token.size()) { - LLAMA_LOG_WARN("%s: bad special token: '%s' = %ud, using default id %d\n", - __func__, key.c_str(), new_id, id); - } else { - id = new_id; - } - } - - // Handle add_bos_token and add_eos_token - { - bool temp = true; - - if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) { - vocab.tokenizer_add_bos = temp; - } - if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) { - vocab.tokenizer_add_eos = temp; - } - } - - // auto-detect special tokens by text - // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_... - // for now, we apply this workaround to find the tokens based on their text - - for (const auto & t : vocab.token_to_id) { - // find EOT token: "<|eot_id|>", "<|im_end|>", "", etc. - if (vocab.special_eot_id == LLAMA_TOKEN_NULL) { - if (false - || t.first == "<|eot_id|>" - || t.first == "<|im_end|>" - || t.first == "<|end|>" - || t.first == "" - || t.first == "<|endoftext|>" - || t.first == "" - || t.first == "<|end▁of▁sentence|>" // DeepSeek - ) { - vocab.special_eot_id = t.second; - if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { - LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", - __func__, t.second, t.first.c_str()); - vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; - } - } - } - - // find EOM token: "<|eom_id|>" - if (vocab.special_eom_id == LLAMA_TOKEN_NULL) { - if (false - || t.first == "<|eom_id|>" - ) { - vocab.special_eom_id = t.second; - if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { - LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. 
its type will be overridden\n", - __func__, t.second, t.first.c_str()); - vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; - } - } - } - - // find FIM_PRE token: "<|fim_prefix|>", "", "
", etc.
-            if (vocab.special_fim_pre_id == LLAMA_TOKEN_NULL) {
-                if (false
-                        || t.first == "<|fim_prefix|>"  // Qwen
-                        || t.first == ""
-                        || t.first == "<|fim▁begin|>" // DeepSeek
-                        || t.first == "
"
-                        ) {
-                    vocab.special_fim_pre_id = t.second;
-                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                                __func__, t.second, t.first.c_str());
-                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
-                    }
-                }
-            }
-
-            // find FIM_SUF token: "<|fim_suffix|>", "", "", etc.
-            if (vocab.special_fim_suf_id == LLAMA_TOKEN_NULL) {
-                if (false
-                        || t.first == "<|fim_suffix|>" // Qwen
-                        || t.first == ""
-                        || t.first == "<|fim▁hole|>" // DeepSeek
-                        || t.first == ""
-                        ) {
-                    vocab.special_fim_suf_id = t.second;
-                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                                __func__, t.second, t.first.c_str());
-                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
-                    }
-                }
-            }
-
-            // find FIM_MID token: "<|fim_middle|>", "", "", etc.
-            if (vocab.special_fim_mid_id == LLAMA_TOKEN_NULL) {
-                if (false
-                        || t.first == "<|fim_middle|>" // Qwen
-                        || t.first == ""
-                        || t.first == "<|fim▁end|>"  // DeepSeek
-                        || t.first == ""
-                        ) {
-                    vocab.special_fim_mid_id = t.second;
-                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                                __func__, t.second, t.first.c_str());
-                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
-                    }
-                }
-            }
-
-            // find FIM_PAD token: "<|fim_pad|>", "", "", etc.
-            if (vocab.special_fim_pad_id == LLAMA_TOKEN_NULL) {
-                if (false
-                        || t.first == "<|fim_pad|>" // Qwen
-                        || t.first == ""
-                        || t.first == ""
-                        ) {
-                    vocab.special_fim_pad_id = t.second;
-                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                                __func__, t.second, t.first.c_str());
-                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
-                    }
-                }
-            }
-
-            // find FIM_REP token: "<|fim_repo|>", "", "", etc.
-            if (vocab.special_fim_rep_id == LLAMA_TOKEN_NULL) {
-                if (false
-                        || t.first == "<|fim_repo|>"  // Qwen
-                        || t.first == "<|repo_name|>"
-                        || t.first == ""
-                        || t.first == ""
-                        ) {
-                    vocab.special_fim_rep_id = t.second;
-                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                                __func__, t.second, t.first.c_str());
-                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
-                    }
-                }
-            }
-
-            // find FIM_SEP token: "<|file_sep|>"
-            if (vocab.special_fim_sep_id == LLAMA_TOKEN_NULL) {
-                if (false
-                        || t.first == "<|file_sep|>" // Qwen
-                        ) {
-                    vocab.special_fim_sep_id = t.second;
-                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                                __func__, t.second, t.first.c_str());
-                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
-                    }
-                }
-            }
-        }
-
-        // maintain a list of tokens that cause end-of-generation
-        // this is currently determined based on the token text, which is obviously not ideal
-        // ref: https://github.com/ggerganov/llama.cpp/issues/9606
-        vocab.special_eog_ids.clear();
-
-        if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_pad_id) == 0) {
-            vocab.special_eog_ids.insert(vocab.special_fim_pad_id);
-        }
-
-        if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_rep_id) == 0) {
-            vocab.special_eog_ids.insert(vocab.special_fim_rep_id);
-        }
-
-        if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_sep_id) == 0) {
-            vocab.special_eog_ids.insert(vocab.special_fim_sep_id);
-        }
-
-        for (const auto & t : vocab.token_to_id) {
-            if (false
-                    || t.first == "<|eot_id|>"
-                    || t.first == "<|im_end|>"
-                    || t.first == "<|end|>"
-                    || t.first == ""
-                    || t.first == "<|endoftext|>"
-                    || t.first == "<|eom_id|>"
-                    || t.first == ""
-               ) {
-                vocab.special_eog_ids.insert(t.second);
-                if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                    LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                            __func__, t.second, t.first.c_str());
-                    vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
-                }
-            } else {
-                // token is control, but not marked as EOG -> print a debug log
-                if (vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && vocab.special_eog_ids.count(t.second) == 0) {
-                    LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
-                            __func__, t.second, t.first.c_str());
-                }
-            }
-        }
-
-        // sanity checks
-        if (vocab.special_eos_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
-            vocab.special_eog_ids.insert(vocab.special_eos_id);
-            LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
-        }
-
-        if (vocab.special_eot_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
-            vocab.special_eog_ids.insert(vocab.special_eot_id);
-            LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
-        }
-
-        if (vocab.special_eom_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
-            vocab.special_eog_ids.insert(vocab.special_eom_id);
-            LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
-        }
-    }
-
-    // build special tokens cache
-    {
-        for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
-            if (vocab.id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
-                vocab.cache_special_tokens.push_back(id);
-            }
-        }
-
-        std::sort(vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
-            [&] (const llama_vocab::id a, const llama_vocab::id b) {
-                return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size();
-            }
-        );
-
-        LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
-    }
-
-    // build token to piece cache
-    {
-        size_t size_cache = 0;
-
-        std::vector cache_token_to_piece(n_vocab);
-
-        for (uint32_t id = 0; id < n_vocab; ++id) {
-            cache_token_to_piece[id] = token_to_piece(id, true);
-
-            size_cache += cache_token_to_piece[id].size();
-        }
-
-        std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
-
-        LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
-    }
-
-    // Handle per token attributes
-    //NOTE: Each model customizes per token attributes.
-    //NOTE: Per token attributes are missing from the GGUF file.
-    //TODO: Extract attributes from GGUF file.
-    {
-        auto _contains_any = [] (const std::string & str, const std::vector & substrs) -> bool {
-            for (const auto & substr : substrs) {
-                if (str.find(substr) < std::string::npos) {
-                    return true;
-                }
-            }
-            return false;
-        };
-
-        auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
-            uint32_t current = vocab.id_to_token.at(id).attr;
-            current = value ? (current | attr) : (current & ~attr);
-            vocab.id_to_token[id].attr = (llama_token_attr) current;
-        };
-
-        auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
-            _set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
-        };
-
-        std::string model_name;
-        std::string tokenizer_pre;
-
-        ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
-        ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
-
-        // model name to lowercase
-        std::transform(model_name.begin(), model_name.end(), model_name.begin(),
-            [] (const std::string::value_type x) {
-                return std::tolower(x);
-            }
-        );
-
-        // set attributes by model/tokenizer name
-        if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
-            _set_token_attr("", LLAMA_TOKEN_ATTR_LSTRIP, true);
-        } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
-            for (auto id : vocab.cache_special_tokens) {
-                _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
-            }
-            for (auto token : {""}) {
-                _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
-            }
-            for (auto token : {"", "", "<|endoftext|>"}) {
-                _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
-            }
-        }
-    }
+    vocab.load(ml, kv);
 }
 
 bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -4441,23 +3735,6 @@ const struct ggml_tensor * llama_model::get_tensor(const char * name) const {
     return it->second;
 }
 
-// NOTE: avoid ever using this except for building the token_to_piece caches
-std::string llama_model::token_to_piece(llama_token token, bool special) const {
-    std::string piece;
-    piece.resize(piece.capacity());  // using string internal cache
-    const int n_chars = llama_token_to_piece(this, token, &piece[0], piece.size(), 0, special);
-    if (n_chars < 0) {
-        piece.resize(-n_chars);
-        int check = llama_token_to_piece(this, token, &piece[0], piece.size(), 0, special);
-        GGML_ASSERT(check == -n_chars);
-    }
-    else {
-        piece.resize(n_chars);
-    }
-
-    return piece;
-}
-
 //
 // interface implementation
 //
diff --git a/src/llama-model.h b/src/llama-model.h
index badcdc71b..4cc8abb75 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -367,8 +367,6 @@ struct llama_model {
 private:
     struct impl;
     std::unique_ptr pimpl;
-
-    std::string token_to_piece(llama_token token, bool special) const;
 };
 
 const char * llm_type_name(llm_type type);
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 5b975a3e7..25ff27dbe 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1,6 +1,7 @@
 #include "llama-vocab.h"
 
 #include "llama-impl.h"
+#include "llama-model-loader.h"
 
 #include "unicode.h"
 
@@ -62,7 +63,7 @@ struct naive_trie {
 };
 
 //
-// impl
+// tokenizers
 //
 
 struct llm_tokenizer {
@@ -70,109 +71,6 @@ struct llm_tokenizer {
     virtual ~llm_tokenizer() = default;
 };
 
-struct llama_vocab::impl {
-    std::unique_ptr<llm_tokenizer> tokenizer;
-};
-
-llama_vocab::llama_vocab() : pimpl(new impl()) {
-}
-
-llama_vocab::~llama_vocab() {
-}
-
-int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
-    GGML_ASSERT(token_left.find(' ')   == std::string::npos);
-    GGML_ASSERT(token_left.find('\n')  == std::string::npos);
-    GGML_ASSERT(token_right.find(' ')  == std::string::npos);
-    GGML_ASSERT(token_right.find('\n') == std::string::npos);
-
-    auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
-    if (it == bpe_ranks.end()) {
-        return -1;
-    }
-
-    return it->second;
-}
-
-enum llama_vocab_type llama_vocab::get_type() const {
-    return type;
-}
-
-std::string llama_vocab::type_name() const{
-    switch (type) {
-        case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
-        case LLAMA_VOCAB_TYPE_SPM:  return "SPM";
-        case LLAMA_VOCAB_TYPE_BPE:  return "BPE";
-        case LLAMA_VOCAB_TYPE_WPM:  return "WPM";
-        case LLAMA_VOCAB_TYPE_UGM:  return "UGM";
-        case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
-        default:                    return "unknown";
-    }
-}
-
-bool llama_vocab::is_normal(llama_token id) const {
-    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
-    return id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
-}
-
-bool llama_vocab::is_unknown(llama_token id) const {
-    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
-    return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
-}
-
-bool llama_vocab::is_control(llama_token id) const {
-    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
-    return id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
-}
-
-bool llama_vocab::is_byte(llama_token id) const {
-    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
-    return id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
-}
-
-bool llama_vocab::is_user_defined(llama_token id) const {
-    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
-    return id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
-}
-
-bool llama_vocab::is_unused(llama_token id) const {
-    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
-    return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED;
-}
-
-bool llama_vocab::is_eog(llama_token id) const {
-    return id != LLAMA_TOKEN_NULL && special_eog_ids.count(id) > 0;
-}
-
-uint8_t llama_vocab::token_to_byte(llama_token id) const {
-    GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
-    GGML_ASSERT(is_byte(id));
-    const auto & token_data = id_to_token.at(id);
-    switch (get_type()) {
-        case LLAMA_VOCAB_TYPE_SPM:
-        case LLAMA_VOCAB_TYPE_UGM: {
-            auto buf = token_data.text.substr(3, 2);
-            return strtol(buf.c_str(), NULL, 16);
-        }
-        case LLAMA_VOCAB_TYPE_BPE: {
-            GGML_ABORT("fatal error");
-        }
-        case LLAMA_VOCAB_TYPE_WPM: {
-            GGML_ABORT("fatal error");
-        }
-        default:
-            GGML_ABORT("fatal error");
-    }
-}
-
-static void llama_escape_whitespace(std::string & text) {
-    replace_all(text, " ", "\xe2\x96\x81");
-}
-
-static void llama_unescape_whitespace(std::string & word) {
-    replace_all(word, "\xe2\x96\x81", " ");
-}
-
 struct llm_symbol {
     using index = int;
     index prev;
@@ -1264,28 +1162,859 @@ private:
     const llm_tokenizer_rwkv & tokenizer;
 };
 
-void llama_vocab::init_tokenizer() {
+//
+// impl
+//
+
+struct llama_vocab::impl {
+    std::unique_ptr<llm_tokenizer> tokenizer;
+
+    impl(const llama_vocab & vocab) : vocab(vocab) {
+    }
+
+    ~impl() {
+    }
+
+    void init_tokenizer(enum llama_vocab_type type);
+
+private:
+    const llama_vocab & vocab;
+};
+
+void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
+    LLAMA_LOG_DEBUG("%s: initializing tokenizer for type %d\n", __func__, type);
+
     switch (type) {
         case LLAMA_VOCAB_TYPE_SPM:
-            pimpl->tokenizer = std::make_unique<llm_tokenizer_spm>(*this);
+            tokenizer = std::make_unique<llm_tokenizer_spm>(vocab);
             break;
         case LLAMA_VOCAB_TYPE_BPE:
-            pimpl->tokenizer = std::make_unique<llm_tokenizer_bpe>(*this);
+            tokenizer = std::make_unique<llm_tokenizer_bpe>(vocab);
             break;
         case LLAMA_VOCAB_TYPE_WPM:
-            pimpl->tokenizer = std::make_unique<llm_tokenizer_wpm>(*this);
+            tokenizer = std::make_unique<llm_tokenizer_wpm>(vocab);
             break;
         case LLAMA_VOCAB_TYPE_UGM:
-            pimpl->tokenizer = std::make_unique<llm_tokenizer_ugm>(*this);
+            tokenizer = std::make_unique<llm_tokenizer_ugm>(vocab);
             break;
         case LLAMA_VOCAB_TYPE_RWKV:
-            pimpl->tokenizer = std::make_unique<llm_tokenizer_rwkv>(*this);
+            tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab);
             break;
         default:
             GGML_ABORT("unsupported vocab type");
     }
 }
 
+// NOTE: avoid ever using this except for building the token_to_piece caches
+std::string llama_vocab::token_to_piece_for_cache(llama_token token, bool special) const {
+    std::string piece;
+    piece.resize(piece.capacity());  // using string internal cache
+    const int n_chars = token_to_piece(token, &piece[0], piece.size(), 0, special);
+    if (n_chars < 0) {
+        piece.resize(-n_chars);
+        int check = token_to_piece(token, &piece[0], piece.size(), 0, special);
+        GGML_ASSERT(check == -n_chars);
+    }
+    else {
+        piece.resize(n_chars);
+    }
+
+    return piece;
+}
+
+llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
+}
+
+llama_vocab::~llama_vocab() {
+}
+
+void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
+    struct gguf_context * ctx = ml.meta.get();
+
+    // determine vocab type
+    {
+        std::string tokenizer_model;
+        std::string tokenizer_pre;
+
+        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
+        ml.get_key(LLM_KV_TOKENIZER_PRE,   tokenizer_pre, false);
+
+        if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
+            type = LLAMA_VOCAB_TYPE_NONE;
+
+            // default special tokens
+            special_bos_id  = LLAMA_TOKEN_NULL;
+            special_eos_id  = LLAMA_TOKEN_NULL;
+            special_unk_id  = LLAMA_TOKEN_NULL;
+            special_sep_id  = LLAMA_TOKEN_NULL;
+            special_pad_id  = LLAMA_TOKEN_NULL;
+            special_cls_id  = LLAMA_TOKEN_NULL;
+            special_mask_id = LLAMA_TOKEN_NULL;
+            linefeed_id     = LLAMA_TOKEN_NULL;
+
+            // read vocab size from metadata
+            if (!ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false)) {
+                n_vocab = 0;
+                LLAMA_LOG_WARN("%s: there is no vocab_size in metadata, n_vocab will be set to %u\n", __func__, n_vocab);
+            }
+
+            return;
+        }
+
+        if (tokenizer_model == "llama") {
+            type = LLAMA_VOCAB_TYPE_SPM;
+
+            // default special tokens
+            special_bos_id  = 1;
+            special_eos_id  = 2;
+            special_unk_id  = 0;
+            special_sep_id  = LLAMA_TOKEN_NULL;
+            special_pad_id  = LLAMA_TOKEN_NULL;
+            special_cls_id  = LLAMA_TOKEN_NULL;
+            special_mask_id = LLAMA_TOKEN_NULL;
+        } else if (tokenizer_model == "bert") {
+            type = LLAMA_VOCAB_TYPE_WPM;
+
+            // default special tokens
+            special_bos_id  = LLAMA_TOKEN_NULL;
+            special_eos_id  = LLAMA_TOKEN_NULL;
+            special_unk_id  = 100;
+            special_sep_id  = 102;
+            special_pad_id  = 0;
+            special_cls_id  = 101;
+            special_mask_id = 103;
+        } else if (tokenizer_model == "gpt2") {
+            type = LLAMA_VOCAB_TYPE_BPE;
+
+            // read bpe merges and populate bpe ranks
+            const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
+            if (merges_keyidx == -1) {
+                throw std::runtime_error("cannot find tokenizer merges in model file\n");
+            }
+
+            const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
+            for (int i = 0; i < n_merges; i++) {
+                const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
+                //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
+
+                std::string first;
+                std::string second;
+
+                const size_t pos = word.find(' ', 1);
+
+                if (pos != std::string::npos) {
+                    first  = word.substr(0, pos);
+                    second = word.substr(pos + 1);
+                }
+
+                bpe_ranks.emplace(std::make_pair(first, second), i);
+            }
+
+            // default special tokens
+            special_bos_id  = 11;
+            special_eos_id  = 11;
+            special_unk_id  = LLAMA_TOKEN_NULL;
+            special_sep_id  = LLAMA_TOKEN_NULL;
+            special_pad_id  = LLAMA_TOKEN_NULL;
+            special_cls_id  = LLAMA_TOKEN_NULL;
+            special_mask_id = LLAMA_TOKEN_NULL;
+        } else if (tokenizer_model == "t5") {
+            type = LLAMA_VOCAB_TYPE_UGM;
+
+            // default special tokens
+            special_bos_id  = LLAMA_TOKEN_NULL;
+            special_eos_id  = 1;
+            special_unk_id  = 2;
+            special_sep_id  = LLAMA_TOKEN_NULL;
+            special_pad_id  = 0;
+            special_cls_id  = LLAMA_TOKEN_NULL;
+            special_mask_id = LLAMA_TOKEN_NULL;
+
+            const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
+            if (precompiled_charsmap_keyidx != -1) {
+                size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
+                const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
+                precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
+#ifdef IS_BIG_ENDIAN
+                // correct endiannes of data in precompiled_charsmap binary blob
+                uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
+                *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
+                assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
+                size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
+                uint32_t * xcda_array = (uint32_t *) &precompiled_charsmap[sizeof(uint32_t)];
+                for (size_t i = 0; i < xcda_array_size; ++i) {
+                    xcda_array[i] = __builtin_bswap32(xcda_array[i]);
+                }
+#endif
+            }
+        } else if (tokenizer_model == "rwkv") {
+            type = LLAMA_VOCAB_TYPE_RWKV;
+
+            // default special tokens
+            special_bos_id = LLAMA_TOKEN_NULL;
+            special_eos_id = LLAMA_TOKEN_NULL;
+            special_unk_id = LLAMA_TOKEN_NULL;
+            special_sep_id = LLAMA_TOKEN_NULL;
+            special_pad_id = LLAMA_TOKEN_NULL;
+        } else {
+            throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
+        }
+
+        // for now, only BPE models have pre-tokenizers
+        if (type == LLAMA_VOCAB_TYPE_BPE) {
+            tokenizer_add_space_prefix = false;
+            tokenizer_clean_spaces = true;
+            if (tokenizer_pre.empty()) {
+                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
+                LLAMA_LOG_WARN("%s:                                             \n", __func__);
+                LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
+                LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED!        \n", __func__);
+                LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL             \n", __func__);
+                LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
+                LLAMA_LOG_WARN("%s:                                             \n", __func__);
+                type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (tokenizer_pre == "default") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (
+                    tokenizer_pre == "llama3"   ||
+                    tokenizer_pre == "llama-v3" ||
+                    tokenizer_pre == "llama-bpe"||
+                    tokenizer_pre == "falcon3") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+                tokenizer_ignore_merges = true;
+                tokenizer_add_bos = true;
+            } else if (
+                    tokenizer_pre == "deepseek-llm") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
+                tokenizer_clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "deepseek-coder") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
+                tokenizer_clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "deepseek-v3") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
+                tokenizer_clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "falcon") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
+            } else if (
+                    tokenizer_pre == "mpt") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
+            } else if (
+                    tokenizer_pre == "starcoder") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
+            } else if (
+                    tokenizer_pre == "gpt-2"   ||
+                    tokenizer_pre == "phi-2"   ||
+                    tokenizer_pre == "jina-es" ||
+                    tokenizer_pre == "jina-de" ||
+                    tokenizer_pre == "gigachat"   ||
+                    tokenizer_pre == "jina-v1-en" ||
+                    tokenizer_pre == "jina-v2-es" ||
+                    tokenizer_pre == "jina-v2-de" ||
+                    tokenizer_pre == "jina-v2-code" ||
+                    tokenizer_pre == "roberta-bpe") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
+            } else if (
+                    tokenizer_pre == "refact") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
+            } else if (
+                tokenizer_pre == "command-r") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
+                tokenizer_clean_spaces = false;
+            } else if (
+                tokenizer_pre == "qwen2") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+                tokenizer_clean_spaces = false;
+            } else if (
+                tokenizer_pre == "stablelm2") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
+            } else if (
+                tokenizer_pre == "olmo") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
+            } else if (
+                tokenizer_pre == "dbrx") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
+            } else if (
+                tokenizer_pre == "smaug-bpe") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+            } else if (
+                tokenizer_pre == "poro-chat") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
+                tokenizer_clean_spaces = false;
+            } else if (
+                tokenizer_pre == "chatglm-bpe") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
+                special_bos_id = LLAMA_TOKEN_NULL;
+            } else if (
+                tokenizer_pre == "viking") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
+                tokenizer_clean_spaces = false;
+            } else if (
+                tokenizer_pre == "jais") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
+            } else if (
+                tokenizer_pre == "tekken") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
+                tokenizer_clean_spaces = false;
+                tokenizer_ignore_merges = true;
+                tokenizer_add_bos = true;
+            } else if (
+                tokenizer_pre == "smollm") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
+                tokenizer_clean_spaces = false;
+            } else if (
+                tokenizer_pre == "codeshell") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
+            } else if (
+                tokenizer_pre == "bloom") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_BLOOM;
+            } else if (
+                tokenizer_pre == "gpt3-finnish") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
+            } else if (
+                tokenizer_pre == "exaone") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
+            } else if (
+                tokenizer_pre == "chameleon") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
+                tokenizer_add_bos = true;
+                tokenizer_clean_spaces = false;
+            } else if (
+                tokenizer_pre == "minerva-7b") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
+            } else if (
+                tokenizer_pre == "megrez") {
+                type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+            } else {
+                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+            }
+        } else if (type == LLAMA_VOCAB_TYPE_SPM) {
+            type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            tokenizer_add_space_prefix = true;
+            tokenizer_clean_spaces = false;
+            tokenizer_add_bos = true;
+            tokenizer_add_eos = false;
+        } else if (type == LLAMA_VOCAB_TYPE_WPM) {
+            type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            tokenizer_add_space_prefix = false;
+            tokenizer_clean_spaces = true;
+            tokenizer_add_bos = true;
+            tokenizer_add_eos = false;
+        } else if (type == LLAMA_VOCAB_TYPE_UGM) {
+            type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            tokenizer_add_bos = false;
+            tokenizer_add_eos = true;
+        } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
+            type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            tokenizer_add_space_prefix = false;
+            tokenizer_clean_spaces = false;
+            tokenizer_add_bos = false;
+            tokenizer_add_eos = false;
+        } else {
+            type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+        }
+
+        ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX,      tokenizer_add_space_prefix,         false);
+        ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, tokenizer_remove_extra_whitespaces, false);
+    }
+
+    const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
+    if (token_idx == -1) {
+        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
+    }
+
+    const float * scores = nullptr;
+    const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
+    if (score_idx != -1) {
+        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
+    }
+
+    const int * toktypes = nullptr;
+    const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
+    if (toktype_idx != -1) {
+        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
+    }
+
+    n_vocab = gguf_get_arr_n(ctx, token_idx);
+    id_to_token.resize(n_vocab);
+
+    for (uint32_t i = 0; i < n_vocab; i++) {
+        std::string word = gguf_get_arr_str(ctx, token_idx, i);
+        if (word.empty()) {
+            LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
+            word = "[EMPTY_" + std::to_string(i) + "]";
+        }
+
+        token_to_id[word] = i;
+        max_token_len = std::max(max_token_len, (int) word.size());
+
+        auto & token_data = id_to_token[i];
+        token_data.text  = std::move(word);
+        token_data.score = scores ? scores[i] : 0.0f;
+        token_data.attr  = LLAMA_TOKEN_ATTR_NORMAL;
+
+        if (toktypes) {  //TODO: remove, required until per token attributes are available from GGUF file
+            switch(toktypes[i]) {
+                case LLAMA_TOKEN_TYPE_UNKNOWN:      token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN;      break;
+                case LLAMA_TOKEN_TYPE_UNUSED:       token_data.attr = LLAMA_TOKEN_ATTR_UNUSED;       break;
+                case LLAMA_TOKEN_TYPE_NORMAL:       token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;       break;
+                case LLAMA_TOKEN_TYPE_CONTROL:      token_data.attr = LLAMA_TOKEN_ATTR_CONTROL;      break;
+                case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
+                case LLAMA_TOKEN_TYPE_BYTE:         token_data.attr = LLAMA_TOKEN_ATTR_BYTE;         break;
+                case LLAMA_TOKEN_TYPE_UNDEFINED:    token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED;    break;
+                default:                            token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED;    break;
+            }
+        }
+    }
+    GGML_ASSERT(id_to_token.size() == token_to_id.size());
+
+    pimpl->init_tokenizer(type);
+
+    // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
+    if (type == LLAMA_VOCAB_TYPE_SPM) {
+        try {
+            linefeed_id = byte_to_token('\n');
+        } catch (const std::exception & e) {
+            LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
+            linefeed_id = special_pad_id;
+        }
+    } else if (type == LLAMA_VOCAB_TYPE_WPM) {
+        linefeed_id = special_pad_id;
+    } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
+        const std::vector<int> ids = tokenize("\n", false);
+        GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
+        linefeed_id = ids[0];
+    } else {
+        const std::vector<int> ids = tokenize("\xC4\x8A", false); // U+010A
+
+        //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
+        if (ids.empty()) {
+            LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
+            linefeed_id = special_pad_id;
+        } else {
+            linefeed_id = ids[0];
+        }
+    }
+
+    // special tokens
+    {
+        const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
+            { LLM_KV_TOKENIZER_BOS_ID,     special_bos_id     },
+            { LLM_KV_TOKENIZER_EOS_ID,     special_eos_id     },
+            { LLM_KV_TOKENIZER_EOT_ID,     special_eot_id     },
+            { LLM_KV_TOKENIZER_EOM_ID,     special_eom_id     },
+            { LLM_KV_TOKENIZER_UNK_ID,     special_unk_id     },
+            { LLM_KV_TOKENIZER_SEP_ID,     special_sep_id     },
+            { LLM_KV_TOKENIZER_PAD_ID,     special_pad_id     },
+            { LLM_KV_TOKENIZER_CLS_ID,     special_cls_id     },
+            { LLM_KV_TOKENIZER_MASK_ID,    special_mask_id    },
+            { LLM_KV_TOKENIZER_FIM_PRE_ID, special_fim_pre_id },
+            { LLM_KV_TOKENIZER_FIM_SUF_ID, special_fim_suf_id },
+            { LLM_KV_TOKENIZER_FIM_MID_ID, special_fim_mid_id },
+            { LLM_KV_TOKENIZER_FIM_PAD_ID, special_fim_pad_id },
+            { LLM_KV_TOKENIZER_FIM_REP_ID, special_fim_rep_id },
+            { LLM_KV_TOKENIZER_FIM_SEP_ID, special_fim_sep_id },
+
+            // deprecated
+            { LLM_KV_TOKENIZER_PREFIX_ID, special_fim_pre_id },
+            { LLM_KV_TOKENIZER_SUFFIX_ID, special_fim_suf_id },
+            { LLM_KV_TOKENIZER_MIDDLE_ID, special_fim_mid_id },
+        };
+
+        for (const auto & it : special_token_types) {
+            const std::string & key = kv(std::get<0>(it));
+            int32_t & id = std::get<1>(it);
+
+            uint32_t new_id;
+            if (!ml.get_key(std::get<0>(it), new_id, false)) {
+                continue;
+            }
+            if (new_id >= id_to_token.size()) {
+                LLAMA_LOG_WARN("%s: bad special token: '%s' = %ud, using default id %d\n",
+                    __func__, key.c_str(), new_id, id);
+            } else {
+                id = new_id;
+            }
+        }
+
+        // Handle add_bos_token and add_eos_token
+        {
+            bool temp = true;
+
+            if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
+                tokenizer_add_bos = temp;
+            }
+            if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
+                tokenizer_add_eos = temp;
+            }
+        }
+
+        // auto-detect special tokens by text
+        // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
+        //       for now, we apply this workaround to find the tokens based on their text
+
+        for (const auto & t : token_to_id) {
+            // find EOT token: "<|eot_id|>", "<|im_end|>", "", etc.
+            if (special_eot_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|eot_id|>"
+                        || t.first == "<|im_end|>"
+                        || t.first == "<|end|>"
+                        || t.first == "<end_of_turn>"
+                        || t.first == "<|endoftext|>"
+                        || t.first == "<EOT>"
+                        || t.first == "<|end▁of▁sentence|>" // DeepSeek
+                   ) {
+                    special_eot_id = t.second;
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find EOM token: "<|eom_id|>"
+            if (special_eom_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|eom_id|>"
+                        ) {
+                    special_eom_id = t.second;
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
+            if (special_fim_pre_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_prefix|>"  // Qwen
+                        || t.first == "<fim-prefix>"
+                        || t.first == "<|fim▁begin|>" // DeepSeek
+                        || t.first == "<PRE>"
+                        ) {
+                    special_fim_pre_id = t.second;
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
+            if (special_fim_suf_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_suffix|>" // Qwen
+                        || t.first == "<fim-suffix>"
+                        || t.first == "<|fim▁hole|>" // DeepSeek
+                        || t.first == "<SUF>"
+                        ) {
+                    special_fim_suf_id = t.second;
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
+            if (special_fim_mid_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_middle|>" // Qwen
+                        || t.first == "<fim-middle>"
+                        || t.first == "<|fim▁end|>"  // DeepSeek
+                        || t.first == "<MID>"
+                        ) {
+                    special_fim_mid_id = t.second;
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
+            if (special_fim_pad_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_pad|>" // Qwen
+                        || t.first == "<fim-pad>"
+                        || t.first == "<PAD>"
+                        ) {
+                    special_fim_pad_id = t.second;
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REPO>", etc.
+            if (special_fim_rep_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_repo|>"  // Qwen
+                        || t.first == "<|repo_name|>"
+                        || t.first == "<fim-repo>"
+                        || t.first == "<REPO>"
+                        ) {
+                    special_fim_rep_id = t.second;
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_SEP token: "<|file_sep|>"
+            if (special_fim_sep_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|file_sep|>" // Qwen
+                        ) {
+                    special_fim_sep_id = t.second;
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+        }
+
+        // maintain a list of tokens that cause end-of-generation
+        // this is currently determined based on the token text, which is obviously not ideal
+        // ref: https://github.com/ggerganov/llama.cpp/issues/9606
+        special_eog_ids.clear();
+
+        if (special_fim_pad_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_pad_id) == 0) {
+            special_eog_ids.insert(special_fim_pad_id);
+        }
+
+        if (special_fim_rep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_rep_id) == 0) {
+            special_eog_ids.insert(special_fim_rep_id);
+        }
+
+        if (special_fim_sep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_sep_id) == 0) {
+            special_eog_ids.insert(special_fim_sep_id);
+        }
+
+        for (const auto & t : token_to_id) {
+            if (false
+                    || t.first == "<|eot_id|>"
+                    || t.first == "<|im_end|>"
+                    || t.first == "<|end|>"
+                    || t.first == "<end_of_turn>"
+                    || t.first == "<|endoftext|>"
+                    || t.first == "<|eom_id|>"
+                    || t.first == "<EOT>"
+               ) {
+                special_eog_ids.insert(t.second);
+                if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.second, t.first.c_str());
+                    id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                }
+            } else {
+                // token is control, but not marked as EOG -> print a debug log
+                if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) {
+                    LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
+                            __func__, t.second, t.first.c_str());
+                }
+            }
+        }
+
+        // sanity checks
+        if (special_eos_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eos_id) == 0) {
+            special_eog_ids.insert(special_eos_id);
+            LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+        }
+
+        if (special_eot_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eot_id) == 0) {
+            special_eog_ids.insert(special_eot_id);
+            LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+        }
+
+        if (special_eom_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eom_id) == 0) {
+            special_eog_ids.insert(special_eom_id);
+            LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+        }
+    }
+
+    // build special tokens cache
+    {
+        for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
+            if (id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
+                cache_special_tokens.push_back(id);
+            }
+        }
+
+        std::sort(cache_special_tokens.begin(), cache_special_tokens.end(),
+            [&] (const llama_vocab::id a, const llama_vocab::id b) {
+                return id_to_token[a].text.size() > id_to_token[b].text.size();
+            }
+        );
+
+        LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t) cache_special_tokens.size());
+    }
+
+    // build token to piece cache
+    {
+        size_t size_cache = 0;
+
+        std::vector<std::string> cache(n_vocab);
+
+        for (uint32_t id = 0; id < n_vocab; ++id) {
+            cache[id] = token_to_piece_for_cache(id, true);
+
+            size_cache += cache[id].size();
+        }
+
+        std::swap(cache_token_to_piece, cache);
+
+        LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
+    }
+
+    // Handle per token attributes
+    //NOTE: Each model customizes per token attributes.
+    //NOTE: Per token attributes are missing from the GGUF file.
+    //TODO: Extract attributes from GGUF file.
+    {
+        auto _contains_any = [] (const std::string & str, const std::vector<std::string> & substrs) -> bool {
+            for (const auto & substr : substrs) {
+                if (str.find(substr) < std::string::npos) {
+                    return true;
+                }
+            }
+            return false;
+        };
+
+        auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
+            uint32_t current = id_to_token.at(id).attr;
+            current = value ? (current | attr) : (current & ~attr);
+            id_to_token[id].attr = (llama_token_attr) current;
+        };
+
+        auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
+            _set_tokenid_attr(token_to_id.at(token), attr, value);
+        };
+
+        std::string model_name;
+        std::string tokenizer_pre;
+
+        ml.get_key(LLM_KV_GENERAL_NAME,  model_name,    false);
+        ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
+
+        // model name to lowercase
+        std::transform(model_name.begin(), model_name.end(), model_name.begin(),
+            [] (const std::string::value_type x) {
+                return std::tolower(x);
+            }
+        );
+
+        // set attributes by model/tokenizer name
+        if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
+            _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
+        } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
+            for (auto id : cache_special_tokens) {
+                _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
+            }
+            for (auto token : {"</s>"}) {
+                _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
+            }
+            for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
+                _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
+            }
+        }
+    }
+}
+
+// Accessor for the vocab/tokenizer type (set while loading the model metadata).
+enum llama_vocab_type llama_vocab::get_type() const {
+    return type;
+}
+
+// Human-readable name of the vocab type, used for logging.
+// Note: space added before '{' for consistency with the sibling definitions below.
+std::string llama_vocab::type_name() const {
+    switch (type) {
+        case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
+        case LLAMA_VOCAB_TYPE_SPM:  return "SPM";
+        case LLAMA_VOCAB_TYPE_BPE:  return "BPE";
+        case LLAMA_VOCAB_TYPE_WPM:  return "WPM";
+        case LLAMA_VOCAB_TYPE_UGM:  return "UGM";
+        case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
+        default:                    return "unknown";
+    }
+}
+
+// Whether token `id` carries the NORMAL attribute bit.
+bool llama_vocab::is_normal(llama_token id) const {
+    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+    const auto attrs = id_to_token[id].attr;
+    return (attrs & LLAMA_TOKEN_ATTR_NORMAL) != 0;
+}
+
+// Whether token `id` carries the UNKNOWN attribute bit.
+bool llama_vocab::is_unknown(llama_token id) const {
+    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+    const auto attrs = id_to_token[id].attr;
+    return (attrs & LLAMA_TOKEN_ATTR_UNKNOWN) != 0;
+}
+
+// Whether token `id` carries the CONTROL attribute bit.
+bool llama_vocab::is_control(llama_token id) const {
+    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+    const auto attrs = id_to_token[id].attr;
+    return (attrs & LLAMA_TOKEN_ATTR_CONTROL) != 0;
+}
+
+// Whether token `id` carries the BYTE attribute bit (raw-byte fallback token).
+bool llama_vocab::is_byte(llama_token id) const {
+    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+    const auto attrs = id_to_token[id].attr;
+    return (attrs & LLAMA_TOKEN_ATTR_BYTE) != 0;
+}
+
+// Whether token `id` carries the USER_DEFINED attribute bit.
+bool llama_vocab::is_user_defined(llama_token id) const {
+    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+    const auto attrs = id_to_token[id].attr;
+    return (attrs & LLAMA_TOKEN_ATTR_USER_DEFINED) != 0;
+}
+
+// Whether token `id` carries the UNUSED attribute bit.
+bool llama_vocab::is_unused(llama_token id) const {
+    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+    const auto attrs = id_to_token[id].attr;
+    return (attrs & LLAMA_TOKEN_ATTR_UNUSED) != 0;
+}
+
+// A token ends generation when it is a valid id registered in special_eog_ids.
+bool llama_vocab::is_eog(llama_token id) const {
+    if (id == LLAMA_TOKEN_NULL) {
+        return false;
+    }
+    return special_eog_ids.find(id) != special_eog_ids.end();
+}
+
+// Decode a byte-fallback token back to its raw byte value.
+// Only SPM/UGM vocabs encode raw bytes as dedicated tokens (presumably of the
+// form "<0xXX>" — the two hex digits are read at offset 3); for every other
+// vocab type this is a hard error.
+// Preconditions: vocab is loaded and is_byte(id) holds (both asserted).
+uint8_t llama_vocab::token_to_byte(llama_token id) const {
+    GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
+    GGML_ASSERT(is_byte(id));
+    const auto & token_data = id_to_token.at(id);
+    switch (get_type()) {
+        case LLAMA_VOCAB_TYPE_SPM:
+        case LLAMA_VOCAB_TYPE_UGM: {
+            // two hex characters starting at offset 3, parsed base-16
+            auto buf = token_data.text.substr(3, 2);
+            return strtol(buf.c_str(), NULL, 16);
+        }
+        case LLAMA_VOCAB_TYPE_BPE: {
+            GGML_ABORT("fatal error");
+        }
+        case LLAMA_VOCAB_TYPE_WPM: {
+            GGML_ABORT("fatal error");
+        }
+        default:
+            GGML_ABORT("fatal error");
+    }
+}
+
+// Replace every ASCII space with "\xe2\x96\x81" (UTF-8 for U+2581, the
+// sentencepiece-style whitespace marker).
+static void llama_escape_whitespace(std::string & text) {
+    replace_all(text, " ", "\xe2\x96\x81");
+}
+
+// Inverse of llama_escape_whitespace: turn the U+2581 marker back into a space.
+static void llama_unescape_whitespace(std::string & word) {
+    replace_all(word, "\xe2\x96\x81", " ");
+}
+
 //
 // (de-) tokenize
 //
@@ -1561,6 +2290,20 @@ llama_token llama_vocab::token_fim_sep() const {
     return special_fim_sep_id;
 }
 
+// Look up the merge rank of the adjacent pair (token_left, token_right).
+// Returns -1 when the pair is not a known BPE merge.
+int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
+    // merge-table entries never contain spaces or newlines
+    GGML_ASSERT(token_left.find(' ')   == std::string::npos);
+    GGML_ASSERT(token_left.find('\n')  == std::string::npos);
+    GGML_ASSERT(token_right.find(' ')  == std::string::npos);
+    GGML_ASSERT(token_right.find('\n') == std::string::npos);
+
+    const auto rank_it = bpe_ranks.find(std::make_pair(token_left, token_right));
+
+    return rank_it != bpe_ranks.end() ? rank_it->second : -1;
+}
+
 std::vector<llama_token> llama_vocab::tokenize(
         std::string raw_text,
         bool add_special,
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
index 3a79cf5b4..f1c735dba 100644
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -9,6 +9,9 @@
 #include <set>
 #include <memory>
 
+struct LLM_KV;
+struct llama_model_loader;
+
 struct llama_vocab {
     using id    = llama_token;
     using token = std::string;
@@ -75,9 +78,7 @@ struct llama_vocab {
     llama_vocab();
     ~llama_vocab();
 
-    int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
-
-    void init_tokenizer();
+    void load(llama_model_loader & ml, const LLM_KV & kv);
 
     enum llama_vocab_type get_type() const;
 
@@ -121,6 +122,8 @@ struct llama_vocab {
     bool add_bos_token() const;
     bool add_eos_token() const;
 
+    int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
+
+    std::vector<llama_token> tokenize(
                   std::string   raw_text,
                          bool   add_special,
@@ -162,4 +165,8 @@ struct llama_vocab {
 private:
     struct impl;
     std::unique_ptr<impl> pimpl;
+
+    std::string token_to_piece_for_cache(
+                  llama_token   token,
+                         bool   special) const;
 };