From cbea4ba102821c72aefb7e9831ea8c1b6af60476 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 10:52:17 +0200 Subject: [PATCH] vocab : llama_vocab_n_vocab -> llama_vocab_n_tokens (#11174) ggml-ci --- common/common.cpp | 2 +- common/sampling.cpp | 6 ++--- common/speculative.cpp | 6 ++--- examples/imatrix/imatrix.cpp | 2 +- examples/llama-bench/llama-bench.cpp | 4 ++-- examples/lookahead/lookahead.cpp | 2 +- examples/perplexity/perplexity.cpp | 14 +++++------ examples/server/server.cpp | 6 ++--- examples/server/utils.hpp | 2 +- examples/speculative/speculative.cpp | 8 +++---- include/llama.h | 4 ++-- src/llama-context.cpp | 8 +++---- src/llama-model.cpp | 2 +- src/llama-sampling.cpp | 4 ++-- src/llama-vocab.cpp | 35 ++++++++++++++-------------- src/llama-vocab.h | 2 +- src/llama.cpp | 6 ++--- tests/test-tokenizer-1-bpe.cpp | 2 +- tests/test-tokenizer-1-spm.cpp | 2 +- 19 files changed, 58 insertions(+), 59 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 07ac7ccad..39bfb0c2e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -950,7 +950,7 @@ struct common_init_result common_init_from_params(common_params & params) { } if (params.sampling.ignore_eos) { - for (llama_token i = 0; i < llama_vocab_n_vocab(vocab); i++) { + for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) { if (llama_vocab_is_eog(vocab, i)) { LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY); params.sampling.logit_bias.push_back({i, -INFINITY}); diff --git a/common/sampling.cpp b/common/sampling.cpp index 1d2c1815e..7241ac321 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -116,7 +116,7 @@ struct common_sampler { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - const int n_vocab = llama_vocab_n_vocab(vocab); + const int n_vocab = llama_vocab_n_tokens(vocab); cur.resize(n_vocab); @@ -162,7 +162,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co llama_sampler_chain_add(result->chain, llama_sampler_init_logit_bias( - llama_vocab_n_vocab(vocab), + llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data())); @@ -211,7 +211,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); } else if (params.mirostat == 1) { llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); - llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_vocab(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); + llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); } else if (params.mirostat == 2) { llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta)); diff --git a/common/speculative.cpp b/common/speculative.cpp index ce8d179e1..318e96ea3 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -105,15 +105,15 @@ bool common_speculative_are_compatible( } { - const int n_vocab_tgt = llama_vocab_n_vocab(vocab_tgt); - const int n_vocab_dft = llama_vocab_n_vocab(vocab_dft); + const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt); + const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft); const 
int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft); if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) { LOG_ERR("%s: draft model vocab must closely match target model to use speculation but " "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", - __func__, n_vocab_tgt, llama_vocab_n_vocab(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); + __func__, n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); return false; } diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 4e69849b7..b5f3feb9f 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -470,7 +470,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const int n_chunk_max = tokens.size() / n_ctx; const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); - const int n_vocab = llama_vocab_n_vocab(vocab); + const int n_vocab = llama_vocab_n_tokens(vocab); const int n_batch = params.n_batch; int count = 0; diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 7b639f410..a3b4c5ac8 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1402,7 +1402,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - const int32_t n_vocab = llama_vocab_n_vocab(vocab); + const int32_t n_vocab = llama_vocab_n_tokens(vocab); std::vector tokens(n_batch); @@ -1426,7 +1426,7 @@ static void test_gen(llama_context * ctx, int n_gen, int n_threads) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - const int32_t n_vocab = llama_vocab_n_vocab(vocab); + const int32_t n_vocab = llama_vocab_n_tokens(vocab); llama_token token = llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab; diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index e23060e24..2f0898e62 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -149,7 +149,7 @@ int main(int argc, char ** argv) { } // here we keep adding new n-grams as we go - ngram_container ngrams_observed(llama_vocab_n_vocab(vocab), N, G); + ngram_container ngrams_observed(llama_vocab_n_tokens(vocab), N, G); // debug struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index c594cdb5a..9bf6c5743 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -341,7 +341,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); const int n_batch = params.n_batch; - const int n_vocab = llama_vocab_n_vocab(vocab); + const int n_vocab = llama_vocab_n_tokens(vocab); int count = 0; double nll = 0.0; @@ -491,7 +491,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & const int n_chunk = params.n_chunks < 0 ? 
n_chunk_max : std::min(params.n_chunks, n_chunk_max); const int n_batch = params.n_batch; - const int n_vocab = llama_vocab_n_vocab(vocab); + const int n_vocab = llama_vocab_n_tokens(vocab); int count = 0; double nll = 0.0; @@ -857,7 +857,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { const int n_ctx = llama_n_ctx(ctx); const int n_batch = params.n_batch; - const int n_vocab = llama_vocab_n_vocab(vocab); + const int n_vocab = llama_vocab_n_tokens(vocab); const int max_tasks_per_batch = 32; const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); @@ -1141,7 +1141,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params) const int n_ctx = llama_n_ctx(ctx); const int n_batch = params.n_batch; - const int n_vocab = llama_vocab_n_vocab(vocab); + const int n_vocab = llama_vocab_n_tokens(vocab); const int max_tasks_per_batch = 128; const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); @@ -1495,7 +1495,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par const int n_ctx = llama_n_ctx(ctx); const int n_batch = params.n_batch; - const int n_vocab = llama_vocab_n_vocab(vocab); + const int n_vocab = llama_vocab_n_tokens(vocab); const int max_tasks_per_batch = 32; const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); @@ -1704,8 +1704,8 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str()); return; } - if (n_vocab != llama_vocab_n_vocab(vocab)) { - LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_vocab_n_vocab(vocab)); + if (n_vocab != llama_vocab_n_tokens(vocab)) { + LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_vocab_n_tokens(vocab)); } std::vector tokens(size_t(n_ctx) * n_chunk); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1374440a3..64c0c4ef6 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -331,7 +331,7 @@ struct server_task { const auto & logit_bias = data.find("logit_bias"); if (logit_bias != data.end() && logit_bias->is_array()) { - const int n_vocab = llama_vocab_n_vocab(vocab); + const int n_vocab = llama_vocab_n_tokens(vocab); for (const auto & el : *logit_bias) { // TODO: we may want to throw errors here, in case "el" is incorrect if (el.is_array() && el.size() == 2) { @@ -2081,7 +2081,7 @@ struct server_context { void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) { size_t n_probs = slot.params.sampling.n_probs; - size_t n_vocab = llama_vocab_n_vocab(vocab); + size_t n_vocab = llama_vocab_n_tokens(vocab); if (post_sampling) { const auto * cur_p = common_sampler_get_candidates(slot.smpl); const size_t max_probs = cur_p->size; @@ -3137,7 +3137,7 @@ struct server_context { json model_meta() const { return json { {"vocab_type", llama_vocab_type (vocab)}, - {"n_vocab", llama_vocab_n_vocab (vocab)}, + {"n_vocab", llama_vocab_n_tokens (vocab)}, {"n_ctx_train", llama_model_n_ctx_train(model)}, {"n_embd", llama_model_n_embd (model)}, {"n_params", llama_model_n_params (model)}, diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 0f75b1ea3..699480f90 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -776,7 +776,7 @@ static std::vector get_token_probabilities(llama_context 
* ctx const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - const int n_vocab = llama_vocab_n_vocab(vocab); + const int n_vocab = llama_vocab_n_tokens(vocab); cur.resize(n_vocab); for (llama_token token_id = 0; token_id < n_vocab; token_id++) { diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index a85342fc1..c7ccea50d 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -116,8 +116,8 @@ int main(int argc, char ** argv) { } { - const int n_vocab_tgt = llama_vocab_n_vocab(vocab_tgt); - const int n_vocab_dft = llama_vocab_n_vocab(vocab_dft); + const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt); + const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft); const int vocab_diff = n_vocab_tgt > n_vocab_dft ? n_vocab_tgt - n_vocab_dft : n_vocab_dft - n_vocab_tgt; @@ -125,7 +125,7 @@ int main(int argc, char ** argv) { if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) { LOG_ERR("%s: draft model vocab must closely match target model to use speculation but ", __func__); LOG_ERR("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", - n_vocab_tgt, llama_vocab_n_vocab(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); + n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); return 1; } @@ -173,7 +173,7 @@ int main(int argc, char ** argv) { const auto t_enc_end = ggml_time_us(); // the 2 models should have the same vocab - //GGML_ASSERT(n_vocab == llama_vocab_n_vocab(model_dft)); + //GGML_ASSERT(n_vocab == llama_vocab_n_tokens(model_dft)); // how many tokens to draft each time int n_draft = params.speculative.n_max; diff --git a/include/llama.h b/include/llama.h index 4685e21bf..9f04bc622 100644 --- a/include/llama.h +++ b/include/llama.h @@ -458,7 +458,7 @@ extern "C" { DEPRECATED(LLAMA_API int32_t llama_n_layer (const struct llama_model * model), "use llama_model_n_layer instead"); DEPRECATED(LLAMA_API int32_t llama_n_head (const struct llama_model * model), "use llama_model_n_head instead"); - DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_vocab instead"); + DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead"); LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx); LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); @@ -476,7 +476,7 @@ extern "C" { LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab); - LLAMA_API int32_t llama_vocab_n_vocab(const struct llama_vocab * vocab); + LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab); // Functions to access the model's GGUF metadata scalar values // - The functions return the length of the string on success, or -1 on failure diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 3cb06ee40..671d2a81a 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -475,7 +475,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); const auto n_batch = cparams.n_batch; - const auto n_vocab = vocab.n_vocab(); + const auto n_vocab = vocab.n_tokens(); const auto n_embd = hparams.n_embd; // TODO: use a per-batch flag for logits presence instead @@ -542,7 +542,7 @@ size_t llama_output_reserve(struct 
llama_context & lctx, size_t n_outputs) { void llama_output_reorder(struct llama_context & ctx) { std::vector & out_ids = ctx.sbatch.out_ids; if (!out_ids.empty()) { - const uint32_t n_vocab = ctx.model.vocab.n_vocab(); + const uint32_t n_vocab = ctx.model.vocab.n_tokens(); const uint32_t n_embd = ctx.model.hparams.n_embd; const int32_t n_outputs = ctx.n_outputs; @@ -726,7 +726,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs)); } - return ctx->logits + j*ctx->model.vocab.n_vocab(); + return ctx->logits + j*ctx->model.vocab.n_tokens(); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); #ifndef NDEBUG @@ -886,7 +886,7 @@ struct llama_data_write { } void write_logits(const struct llama_context * ctx) { - const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.vocab.n_vocab()); + const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.vocab.n_tokens()); write(&logits_size, sizeof(logits_size)); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 27afbba03..f90f5e746 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1367,7 +1367,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int64_t n_embd_head_v = hparams.n_embd_head_v; const int64_t n_ff = hparams.n_ff(); const int64_t n_embd_gqa = n_embd_v_gqa; - const int64_t n_vocab = vocab.n_vocab(); + const int64_t n_vocab = vocab.n_tokens(); const int64_t n_token_types = vocab.n_token_types(); const int64_t n_rot = hparams.n_rot; const int64_t n_expert = hparams.n_expert; diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 8775cf5e3..b3a12386e 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -374,7 +374,7 @@ llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_conte const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - const int n_vocab = llama_vocab_n_vocab(vocab); + const int n_vocab = llama_vocab_n_tokens(vocab); // TODO: do not allocate each time std::vector cur; @@ -1666,7 +1666,7 @@ struct llama_sampler_dry { // Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am) static void get_overlapping_token_sequences(const llama_vocab & vocab, const std::string& str, std::unordered_multimap>& token_sequences, int max_tail_len = -1) { - for (llama_token token_id = 0; token_id < (llama_token) vocab.n_vocab(); token_id++) { + for (llama_token token_id = 0; token_id < (llama_token) vocab.n_tokens(); token_id++) { std::string word = vocab.detokenize({token_id}, true); if (word.find(str) != std::string::npos) { token_sequences.emplace(token_id, std::vector()); diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index ceff5ba7e..ed8751737 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -206,7 +206,7 @@ private: return; } - if (static_cast(token) >= vocab.n_vocab()) { + if (static_cast(token) >= vocab.n_tokens()) { return; } @@ -732,7 +732,7 @@ struct llm_tokenizer_ugm : llm_tokenizer { prefix_replacements_size = precompiled_charsmap.size() - charsmap_offset; } - for (uint32_t id = 0; id < vocab.n_vocab(); ++id) { + for (uint32_t id = 0; id < vocab.n_tokens(); ++id) { const auto & token_data = vocab.get_token_data(id); if (vocab.is_normal(id)) { @@ 
-1117,7 +1117,7 @@ struct llm_tokenizer_rwkv : llm_tokenizer { // For now, we decode the vocab here into the lookup we'll use for tokenization. // build trie - for (uint32_t id = 0; id < vocab.n_vocab(); ++id) { + for (uint32_t id = 0; id < vocab.n_tokens(); ++id) { const auto & data = vocab.get_token_data(id); const auto text = llama_unescape_rwkv_token(data.text); token_matcher.insert((const char *) text.data(), text.size(), id); @@ -1202,7 +1202,6 @@ struct fragment_buffer_variant { }; struct llama_vocab::impl { - uint32_t n_vocab = 0; uint32_t n_token_types = 0; // for BERT-style token types enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; @@ -1358,9 +1357,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { linefeed_id = LLAMA_TOKEN_NULL; // read vocab size from metadata - if (!ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false)) { - n_vocab = 0; - LLAMA_LOG_WARN("%s: there is no vocab_size in metadata, n_vocab will be set to %u\n", __func__, n_vocab); + uint32_t n_tokens = 0; + if (!ml.get_key(LLM_KV_VOCAB_SIZE, n_tokens, false)) { + LLAMA_LOG_WARN("%s: there is no vocab_size in metadata\n", __func__); } return; @@ -1642,10 +1641,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); } - n_vocab = gguf_get_arr_n(ctx, token_idx); - id_to_token.resize(n_vocab); + uint32_t n_tokens = gguf_get_arr_n(ctx, token_idx); + id_to_token.resize(n_tokens); - for (uint32_t i = 0; i < n_vocab; i++) { + for (uint32_t i = 0; i < n_tokens; i++) { std::string word = gguf_get_arr_str(ctx, token_idx, i); if (word.empty()) { LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i); @@ -1955,7 +1954,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { // build special tokens cache { - for (llama_token id = 0; id < (llama_token) n_vocab; ++id) { + for (llama_token id = 0; id < (llama_token) n_tokens; ++id) { if (id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) { cache_special_tokens.push_back(id); } @@ -1974,9 +1973,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { { size_t size_cache = 0; - std::vector cache(n_vocab); + std::vector cache(n_tokens); - for (uint32_t id = 0; id < n_vocab; ++id) { + for (uint32_t id = 0; id < n_tokens; ++id) { cache[id] = token_to_piece_for_cache(id, true); size_cache += cache[id].size(); @@ -2690,7 +2689,7 @@ int32_t llama_vocab::impl::detokenize( void llama_vocab::impl::print_info() const { LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str()); - LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, n_vocab); + LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens()); LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size()); // special tokens @@ -2738,7 +2737,7 @@ enum llama_vocab_pre_type llama_vocab::get_pre_type() const { return pimpl->pre_type; } -uint32_t llama_vocab::n_vocab() const { +uint32_t llama_vocab::n_tokens() const { return (uint32_t) pimpl->id_to_token.size(); } @@ -3025,13 +3024,13 @@ void llama_vocab::print_info() const { // interface implementation // -int32_t llama_vocab_n_vocab(const struct llama_vocab * vocab) { - return vocab->n_vocab(); +int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab) { + return vocab->n_tokens(); } // deprecated int32_t llama_n_vocab(const struct llama_vocab * vocab) { - return llama_vocab_n_vocab(vocab); + return llama_vocab_n_tokens(vocab); } enum 
llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab) { diff --git a/src/llama-vocab.h b/src/llama-vocab.h index bbc0a237d..020f2b533 100644 --- a/src/llama-vocab.h +++ b/src/llama-vocab.h @@ -24,7 +24,7 @@ struct llama_vocab { enum llama_vocab_type get_type() const; enum llama_vocab_pre_type get_pre_type() const; - uint32_t n_vocab() const; + uint32_t n_tokens() const; uint32_t n_token_types() const; std::string type_name() const; diff --git a/src/llama.cpp b/src/llama.cpp index fa8dff09d..daf1b7c97 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8470,7 +8470,7 @@ static int llama_decode_impl( if (batch.token) { for (uint32_t i = 0; i < n_tokens_all; ++i) { - if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_vocab()) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); return -1; } @@ -8490,7 +8490,7 @@ static int llama_decode_impl( llama_kv_slot_restorer kv_slot_restorer(kv_self); const int64_t n_embd = hparams.n_embd; - const int64_t n_vocab = vocab.n_vocab(); + const int64_t n_vocab = vocab.n_tokens(); uint32_t n_outputs = 0; uint32_t n_outputs_prev = 0; @@ -8805,7 +8805,7 @@ static int llama_encode_impl( if (batch.token) { for (uint32_t i = 0; i < n_tokens; ++i) { - if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_vocab()) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); return -1; } diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp index 9360a061e..55425d88a 100644 --- a/tests/test-tokenizer-1-bpe.cpp +++ b/tests/test-tokenizer-1-bpe.cpp @@ -77,7 +77,7 @@ int main(int argc, char **argv) { atexit([]() { console::cleanup(); }); #endif - const int n_vocab = llama_vocab_n_vocab(vocab); + const int n_vocab = llama_vocab_n_tokens(vocab); for (int i = 0; i < n_vocab; ++i) { std::string str = common_detokenize(ctx, std::vector(1, i)); diff --git a/tests/test-tokenizer-1-spm.cpp b/tests/test-tokenizer-1-spm.cpp index da84308f2..9e7b77f31 100644 --- a/tests/test-tokenizer-1-spm.cpp +++ b/tests/test-tokenizer-1-spm.cpp @@ -65,7 +65,7 @@ int main(int argc, char ** argv) { atexit([]() { console::cleanup(); }); #endif - const int n_vocab = llama_vocab_n_vocab(vocab); + const int n_vocab = llama_vocab_n_tokens(vocab); for (int i = 0; i < n_vocab; ++i) { std::string str = common_detokenize(ctx, std::vector(1, i), true);
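For downstream callers of the C API, the change is a pure rename: llama_vocab_n_vocab() becomes llama_vocab_n_tokens(), and the old llama_n_vocab() is kept only as a deprecated wrapper. The following is a minimal caller-side sketch using only functions that appear in the hunks above; the helper count_eog_tokens and its surrounding structure are illustrative, not part of this patch.

    #include "llama.h"

    // Sketch: iterate the vocabulary with the renamed API and count
    // end-of-generation tokens, similar to what common_init_from_params
    // does in common/common.cpp above.
    static int32_t count_eog_tokens(const struct llama_context * ctx) {
        const struct llama_model * model = llama_get_model(ctx);
        const struct llama_vocab * vocab = llama_model_get_vocab(model);

        // before this patch: llama_vocab_n_vocab(vocab)
        const int32_t n_vocab = llama_vocab_n_tokens(vocab);

        int32_t n_eog = 0;
        for (llama_token id = 0; id < n_vocab; ++id) {
            if (llama_vocab_is_eog(vocab, id)) {
                n_eog++;
            }
        }

        return n_eog;
    }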