From cbea4ba102821c72aefb7e9831ea8c1b6af60476 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 10:52:17 +0200 Subject: [PATCH] vocab : llama_vocab_n_vocab -> llama_vocab_n_tokens (#11174) ggml-ci --- common/common.cpp | 2 +- common/sampling.cpp | 6 ++--- common/speculative.cpp | 6 ++--- examples/imatrix/imatrix.cpp | 2 +- examples/llama-bench/llama-bench.cpp | 4 ++-- examples/lookahead/lookahead.cpp | 2 +- examples/perplexity/perplexity.cpp | 14 +++++------ examples/server/server.cpp | 6 ++--- examples/server/utils.hpp | 2 +- examples/speculative/speculative.cpp | 8 +++---- include/llama.h | 4 ++-- src/llama-context.cpp | 8 +++---- src/llama-model.cpp | 2 +- src/llama-sampling.cpp | 4 ++-- src/llama-vocab.cpp | 35 ++++++++++++++-------------- src/llama-vocab.h | 2 +- src/llama.cpp | 6 ++--- tests/test-tokenizer-1-bpe.cpp | 2 +- tests/test-tokenizer-1-spm.cpp | 2 +- 19 files changed, 58 insertions(+), 59 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 07ac7ccad..39bfb0c2e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -950,7 +950,7 @@ struct common_init_result common_init_from_params(common_params & params) { } if (params.sampling.ignore_eos) { - for (llama_token i = 0; i < llama_vocab_n_vocab(vocab); i++) { + for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) { if (llama_vocab_is_eog(vocab, i)) { LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY); params.sampling.logit_bias.push_back({i, -INFINITY}); diff --git a/common/sampling.cpp b/common/sampling.cpp index 1d2c1815e..7241ac321 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -116,7 +116,7 @@ struct common_sampler { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - const int n_vocab = llama_vocab_n_vocab(vocab); + const int n_vocab = llama_vocab_n_tokens(vocab); cur.resize(n_vocab); @@ -162,7 +162,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co llama_sampler_chain_add(result->chain, llama_sampler_init_logit_bias( - llama_vocab_n_vocab(vocab), + llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data())); @@ -211,7 +211,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); } else if (params.mirostat == 1) { llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); - llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_vocab(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); + llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); } else if (params.mirostat == 2) { llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta)); diff --git a/common/speculative.cpp b/common/speculative.cpp index ce8d179e1..318e96ea3 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -105,15 +105,15 @@ bool common_speculative_are_compatible( } { - const int n_vocab_tgt = llama_vocab_n_vocab(vocab_tgt); - const int n_vocab_dft = llama_vocab_n_vocab(vocab_dft); + const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt); + const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft); const 
int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft); if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) { LOG_ERR("%s: draft model vocab must closely match target model to use speculation but " "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", - __func__, n_vocab_tgt, llama_vocab_n_vocab(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); + __func__, n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); return false; } diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 4e69849b7..b5f3feb9f 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -470,7 +470,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const int n_chunk_max = tokens.size() / n_ctx; const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); - const int n_vocab = llama_vocab_n_vocab(vocab); + const int n_vocab = llama_vocab_n_tokens(vocab); const int n_batch = params.n_batch; int count = 0; diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 7b639f410..a3b4c5ac8 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1402,7 +1402,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - const int32_t n_vocab = llama_vocab_n_vocab(vocab); + const int32_t n_vocab = llama_vocab_n_tokens(vocab); std::vector tokens(n_batch); @@ -1426,7 +1426,7 @@ static void test_gen(llama_context * ctx, int n_gen, int n_threads) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - const int32_t n_vocab = llama_vocab_n_vocab(vocab); + const int32_t n_vocab = llama_vocab_n_tokens(vocab); llama_token token = llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab; diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index e23060e24..2f0898e62 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -149,7 +149,7 @@ int main(int argc, char ** argv) { } // here we keep adding new n-grams as we go - ngram_container ngrams_observed(llama_vocab_n_vocab(vocab), N, G); + ngram_container ngrams_observed(llama_vocab_n_tokens(vocab), N, G); // debug struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index c594cdb5a..9bf6c5743 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -341,7 +341,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); const int n_batch = params.n_batch; - const int n_vocab = llama_vocab_n_vocab(vocab); + const int n_vocab = llama_vocab_n_tokens(vocab); int count = 0; double nll = 0.0; @@ -491,7 +491,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & const int n_chunk = params.n_chunks < 0 ? 
n_chunk_max : std::min(params.n_chunks, n_chunk_max); const int n_batch = params.n_batch; - const int n_vocab = llama_vocab_n_vocab(vocab); + const int n_vocab = llama_vocab_n_tokens(vocab); int count = 0; double nll = 0.0; @@ -857,7 +857,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { const int n_ctx = llama_n_ctx(ctx); const int n_batch = params.n_batch; - const int n_vocab = llama_vocab_n_vocab(vocab); + const int n_vocab = llama_vocab_n_tokens(vocab); const int max_tasks_per_batch = 32; const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); @@ -1141,7 +1141,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params) const int n_ctx = llama_n_ctx(ctx); const int n_batch = params.n_batch; - const int n_vocab = llama_vocab_n_vocab(vocab); + const int n_vocab = llama_vocab_n_tokens(vocab); const int max_tasks_per_batch = 128; const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); @@ -1495,7 +1495,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par const int n_ctx = llama_n_ctx(ctx); const int n_batch = params.n_batch; - const int n_vocab = llama_vocab_n_vocab(vocab); + const int n_vocab = llama_vocab_n_tokens(vocab); const int max_tasks_per_batch = 32; const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); @@ -1704,8 +1704,8 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str()); return; } - if (n_vocab != llama_vocab_n_vocab(vocab)) { - LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_vocab_n_vocab(vocab)); + if (n_vocab != llama_vocab_n_tokens(vocab)) { + LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_vocab_n_tokens(vocab)); } std::vector tokens(size_t(n_ctx) * n_chunk); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1374440a3..64c0c4ef6 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -331,7 +331,7 @@ struct server_task { const auto & logit_bias = data.find("logit_bias"); if (logit_bias != data.end() && logit_bias->is_array()) { - const int n_vocab = llama_vocab_n_vocab(vocab); + const int n_vocab = llama_vocab_n_tokens(vocab); for (const auto & el : *logit_bias) { // TODO: we may want to throw errors here, in case "el" is incorrect if (el.is_array() && el.size() == 2) { @@ -2081,7 +2081,7 @@ struct server_context { void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) { size_t n_probs = slot.params.sampling.n_probs; - size_t n_vocab = llama_vocab_n_vocab(vocab); + size_t n_vocab = llama_vocab_n_tokens(vocab); if (post_sampling) { const auto * cur_p = common_sampler_get_candidates(slot.smpl); const size_t max_probs = cur_p->size; @@ -3137,7 +3137,7 @@ struct server_context { json model_meta() const { return json { {"vocab_type", llama_vocab_type (vocab)}, - {"n_vocab", llama_vocab_n_vocab (vocab)}, + {"n_vocab", llama_vocab_n_tokens (vocab)}, {"n_ctx_train", llama_model_n_ctx_train(model)}, {"n_embd", llama_model_n_embd (model)}, {"n_params", llama_model_n_params (model)}, diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 0f75b1ea3..699480f90 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -776,7 +776,7 @@ static std::vector get_token_probabilities(llama_context 
* ctx const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - const int n_vocab = llama_vocab_n_vocab(vocab); + const int n_vocab = llama_vocab_n_tokens(vocab); cur.resize(n_vocab); for (llama_token token_id = 0; token_id < n_vocab; token_id++) { diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index a85342fc1..c7ccea50d 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -116,8 +116,8 @@ int main(int argc, char ** argv) { } { - const int n_vocab_tgt = llama_vocab_n_vocab(vocab_tgt); - const int n_vocab_dft = llama_vocab_n_vocab(vocab_dft); + const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt); + const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft); const int vocab_diff = n_vocab_tgt > n_vocab_dft ? n_vocab_tgt - n_vocab_dft : n_vocab_dft - n_vocab_tgt; @@ -125,7 +125,7 @@ int main(int argc, char ** argv) { if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) { LOG_ERR("%s: draft model vocab must closely match target model to use speculation but ", __func__); LOG_ERR("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", - n_vocab_tgt, llama_vocab_n_vocab(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); + n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); return 1; } @@ -173,7 +173,7 @@ int main(int argc, char ** argv) { const auto t_enc_end = ggml_time_us(); // the 2 models should have the same vocab - //GGML_ASSERT(n_vocab == llama_vocab_n_vocab(model_dft)); + //GGML_ASSERT(n_vocab == llama_vocab_n_tokens(model_dft)); // how many tokens to draft each time int n_draft = params.speculative.n_max; diff --git a/include/llama.h b/include/llama.h index 4685e21bf..9f04bc622 100644 --- a/include/llama.h +++ b/include/llama.h @@ -458,7 +458,7 @@ extern "C" { DEPRECATED(LLAMA_API int32_t llama_n_layer (const struct llama_model * model), "use llama_model_n_layer instead"); DEPRECATED(LLAMA_API int32_t llama_n_head (const struct llama_model * model), "use llama_model_n_head instead"); - DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_vocab instead"); + DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead"); LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx); LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); @@ -476,7 +476,7 @@ extern "C" { LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab); - LLAMA_API int32_t llama_vocab_n_vocab(const struct llama_vocab * vocab); + LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab); // Functions to access the model's GGUF metadata scalar values // - The functions return the length of the string on success, or -1 on failure diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 3cb06ee40..671d2a81a 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -475,7 +475,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); const auto n_batch = cparams.n_batch; - const auto n_vocab = vocab.n_vocab(); + const auto n_vocab = vocab.n_tokens(); const auto n_embd = hparams.n_embd; // TODO: use a per-batch flag for logits presence instead @@ -542,7 +542,7 @@ size_t llama_output_reserve(struct 
llama_context & lctx, size_t n_outputs) { void llama_output_reorder(struct llama_context & ctx) { std::vector & out_ids = ctx.sbatch.out_ids; if (!out_ids.empty()) { - const uint32_t n_vocab = ctx.model.vocab.n_vocab(); + const uint32_t n_vocab = ctx.model.vocab.n_tokens(); const uint32_t n_embd = ctx.model.hparams.n_embd; const int32_t n_outputs = ctx.n_outputs; @@ -726,7 +726,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs)); } - return ctx->logits + j*ctx->model.vocab.n_vocab(); + return ctx->logits + j*ctx->model.vocab.n_tokens(); } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); #ifndef NDEBUG @@ -886,7 +886,7 @@ struct llama_data_write { } void write_logits(const struct llama_context * ctx) { - const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.vocab.n_vocab()); + const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.vocab.n_tokens()); write(&logits_size, sizeof(logits_size)); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 27afbba03..f90f5e746 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1367,7 +1367,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int64_t n_embd_head_v = hparams.n_embd_head_v; const int64_t n_ff = hparams.n_ff(); const int64_t n_embd_gqa = n_embd_v_gqa; - const int64_t n_vocab = vocab.n_vocab(); + const int64_t n_vocab = vocab.n_tokens(); const int64_t n_token_types = vocab.n_token_types(); const int64_t n_rot = hparams.n_rot; const int64_t n_expert = hparams.n_expert; diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 8775cf5e3..b3a12386e 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -374,7 +374,7 @@ llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_conte const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - const int n_vocab = llama_vocab_n_vocab(vocab); + const int n_vocab = llama_vocab_n_tokens(vocab); // TODO: do not allocate each time std::vector cur; @@ -1666,7 +1666,7 @@ struct llama_sampler_dry { // Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am) static void get_overlapping_token_sequences(const llama_vocab & vocab, const std::string& str, std::unordered_multimap>& token_sequences, int max_tail_len = -1) { - for (llama_token token_id = 0; token_id < (llama_token) vocab.n_vocab(); token_id++) { + for (llama_token token_id = 0; token_id < (llama_token) vocab.n_tokens(); token_id++) { std::string word = vocab.detokenize({token_id}, true); if (word.find(str) != std::string::npos) { token_sequences.emplace(token_id, std::vector()); diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index ceff5ba7e..ed8751737 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -206,7 +206,7 @@ private: return; } - if (static_cast(token) >= vocab.n_vocab()) { + if (static_cast(token) >= vocab.n_tokens()) { return; } @@ -732,7 +732,7 @@ struct llm_tokenizer_ugm : llm_tokenizer { prefix_replacements_size = precompiled_charsmap.size() - charsmap_offset; } - for (uint32_t id = 0; id < vocab.n_vocab(); ++id) { + for (uint32_t id = 0; id < vocab.n_tokens(); ++id) { const auto & token_data = vocab.get_token_data(id); if (vocab.is_normal(id)) { @@ 
-1117,7 +1117,7 @@ struct llm_tokenizer_rwkv : llm_tokenizer { // For now, we decode the vocab here into the lookup we'll use for tokenization. // build trie - for (uint32_t id = 0; id < vocab.n_vocab(); ++id) { + for (uint32_t id = 0; id < vocab.n_tokens(); ++id) { const auto & data = vocab.get_token_data(id); const auto text = llama_unescape_rwkv_token(data.text); token_matcher.insert((const char *) text.data(), text.size(), id); @@ -1202,7 +1202,6 @@ struct fragment_buffer_variant { }; struct llama_vocab::impl { - uint32_t n_vocab = 0; uint32_t n_token_types = 0; // for BERT-style token types enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; @@ -1358,9 +1357,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { linefeed_id = LLAMA_TOKEN_NULL; // read vocab size from metadata - if (!ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false)) { - n_vocab = 0; - LLAMA_LOG_WARN("%s: there is no vocab_size in metadata, n_vocab will be set to %u\n", __func__, n_vocab); + uint32_t n_tokens = 0; + if (!ml.get_key(LLM_KV_VOCAB_SIZE, n_tokens, false)) { + LLAMA_LOG_WARN("%s: there is no vocab_size in metadata\n", __func__); } return; @@ -1642,10 +1641,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); } - n_vocab = gguf_get_arr_n(ctx, token_idx); - id_to_token.resize(n_vocab); + uint32_t n_tokens = gguf_get_arr_n(ctx, token_idx); + id_to_token.resize(n_tokens); - for (uint32_t i = 0; i < n_vocab; i++) { + for (uint32_t i = 0; i < n_tokens; i++) { std::string word = gguf_get_arr_str(ctx, token_idx, i); if (word.empty()) { LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i); @@ -1955,7 +1954,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { // build special tokens cache { - for (llama_token id = 0; id < (llama_token) n_vocab; ++id) { + for (llama_token id = 0; id < (llama_token) n_tokens; ++id) { if (id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) { cache_special_tokens.push_back(id); } @@ -1974,9 +1973,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { { size_t size_cache = 0; - std::vector cache(n_vocab); + std::vector cache(n_tokens); - for (uint32_t id = 0; id < n_vocab; ++id) { + for (uint32_t id = 0; id < n_tokens; ++id) { cache[id] = token_to_piece_for_cache(id, true); size_cache += cache[id].size(); @@ -2690,7 +2689,7 @@ int32_t llama_vocab::impl::detokenize( void llama_vocab::impl::print_info() const { LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str()); - LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, n_vocab); + LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens()); LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size()); // special tokens @@ -2738,7 +2737,7 @@ enum llama_vocab_pre_type llama_vocab::get_pre_type() const { return pimpl->pre_type; } -uint32_t llama_vocab::n_vocab() const { +uint32_t llama_vocab::n_tokens() const { return (uint32_t) pimpl->id_to_token.size(); } @@ -3025,13 +3024,13 @@ void llama_vocab::print_info() const { // interface implementation // -int32_t llama_vocab_n_vocab(const struct llama_vocab * vocab) { - return vocab->n_vocab(); +int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab) { + return vocab->n_tokens(); } // deprecated int32_t llama_n_vocab(const struct llama_vocab * vocab) { - return llama_vocab_n_vocab(vocab); + return llama_vocab_n_tokens(vocab); } enum 
llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab) { diff --git a/src/llama-vocab.h b/src/llama-vocab.h index bbc0a237d..020f2b533 100644 --- a/src/llama-vocab.h +++ b/src/llama-vocab.h @@ -24,7 +24,7 @@ struct llama_vocab { enum llama_vocab_type get_type() const; enum llama_vocab_pre_type get_pre_type() const; - uint32_t n_vocab() const; + uint32_t n_tokens() const; uint32_t n_token_types() const; std::string type_name() const; diff --git a/src/llama.cpp b/src/llama.cpp index fa8dff09d..daf1b7c97 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8470,7 +8470,7 @@ static int llama_decode_impl( if (batch.token) { for (uint32_t i = 0; i < n_tokens_all; ++i) { - if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_vocab()) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); return -1; } @@ -8490,7 +8490,7 @@ static int llama_decode_impl( llama_kv_slot_restorer kv_slot_restorer(kv_self); const int64_t n_embd = hparams.n_embd; - const int64_t n_vocab = vocab.n_vocab(); + const int64_t n_vocab = vocab.n_tokens(); uint32_t n_outputs = 0; uint32_t n_outputs_prev = 0; @@ -8805,7 +8805,7 @@ static int llama_encode_impl( if (batch.token) { for (uint32_t i = 0; i < n_tokens; ++i) { - if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_vocab()) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); return -1; } diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp index 9360a061e..55425d88a 100644 --- a/tests/test-tokenizer-1-bpe.cpp +++ b/tests/test-tokenizer-1-bpe.cpp @@ -77,7 +77,7 @@ int main(int argc, char **argv) { atexit([]() { console::cleanup(); }); #endif - const int n_vocab = llama_vocab_n_vocab(vocab); + const int n_vocab = llama_vocab_n_tokens(vocab); for (int i = 0; i < n_vocab; ++i) { std::string str = common_detokenize(ctx, std::vector(1, i)); diff --git a/tests/test-tokenizer-1-spm.cpp b/tests/test-tokenizer-1-spm.cpp index da84308f2..9e7b77f31 100644 --- a/tests/test-tokenizer-1-spm.cpp +++ b/tests/test-tokenizer-1-spm.cpp @@ -65,7 +65,7 @@ int main(int argc, char ** argv) { atexit([]() { console::cleanup(); }); #endif - const int n_vocab = llama_vocab_n_vocab(vocab); + const int n_vocab = llama_vocab_n_tokens(vocab); for (int i = 0; i < n_vocab; ++i) { std::string str = common_detokenize(ctx, std::vector(1, i), true);
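For downstream callers of the C API, the change is a pure rename: llama_vocab_n_vocab() becomes llama_vocab_n_tokens(), and the old llama_n_vocab() is kept only as a deprecated wrapper. The following is a minimal caller-side sketch using only functions that appear in the hunks above; the helper count_eog_tokens and its surrounding structure are illustrative, not part of this patch.

    #include "llama.h"

    // Sketch: iterate the vocabulary with the renamed API and count
    // end-of-generation tokens, similar to what common_init_from_params
    // does in common/common.cpp above.
    static int32_t count_eog_tokens(const struct llama_context * ctx) {
        const struct llama_model * model = llama_get_model(ctx);
        const struct llama_vocab * vocab = llama_model_get_vocab(model);

        // before this patch: llama_vocab_n_vocab(vocab)
        const int32_t n_vocab = llama_vocab_n_tokens(vocab);

        int32_t n_eog = 0;
        for (llama_token id = 0; id < n_vocab; ++id) {
            if (llama_vocab_is_eog(vocab, id)) {
                n_eog++;
            }
        }

        return n_eog;
    }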