llama : add llama_vocab, functions -> methods, naming (#11110)

* llama : functions -> methods (#11110)

* llama : add struct llama_vocab to the API (#11156)

ggml-ci

* hparams : move vocab params to llama_vocab (#11159)

ggml-ci

* vocab : more pimpl (#11165)

ggml-ci

* vocab : minor tokenization optimizations (#11160)

ggml-ci

Co-authored-by: Diego Devesa <slarengh@gmail.com>

* lora : update API names (#11167)

ggml-ci

* llama : update API names to use correct prefix (#11174)

* llama : update API names to use correct prefix

ggml-ci

* cont

ggml-ci

* cont

ggml-ci

* minor [no ci]

* vocab : llama_vocab_add_[be]os -> llama_vocab_get_add_[be]os (#11174)

ggml-ci

* vocab : llama_vocab_n_vocab -> llama_vocab_n_tokens (#11174)

ggml-ci

---------

Co-authored-by: Diego Devesa <slarengh@gmail.com>
2025-01-12 11:32:42 +02:00 · 2025-01-12 11:32:42 +02:00 · afa8a9ec9b
commit afa8a9ec9b
parent c05e8c9934
68 changed files with 5855 additions and 5400 deletions
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -5,7 +5,6 @@
 #include "sampling.h"
 #include "llama.h"

-#include <cassert>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
@@ -163,6 +162,8 @@ int main(int argc, char ** argv) {
        return 1;
    }

+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
    LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);

    auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
@@ -196,7 +197,7 @@ int main(int argc, char ** argv) {

    llama_attach_threadpool(ctx, threadpool, threadpool_batch);

-    const int n_ctx_train = llama_n_ctx_train(model);
+    const int n_ctx_train = llama_model_n_ctx_train(model);
    const int n_ctx = llama_n_ctx(ctx);

    if (n_ctx > n_ctx_train) {
@@ -241,9 +242,9 @@ int main(int argc, char ** argv) {
        }
    }

-    const bool add_bos = llama_add_bos_token(model);
+    const bool add_bos = llama_vocab_get_add_bos(vocab);
    if (!llama_model_has_encoder(model)) {
-        GGML_ASSERT(!llama_add_eos_token(model));
+        GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
    }

    LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos);
@@ -269,7 +270,7 @@ int main(int argc, char ** argv) {
    // Should not run without any tokens
    if (embd_inp.empty()) {
        if (add_bos) {
-            embd_inp.push_back(llama_token_bos(model));
+            embd_inp.push_back(llama_vocab_bos(vocab));
            LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
        } else {
            LOG_ERR("input is empty\n");
@@ -495,7 +496,7 @@ int main(int argc, char ** argv) {

        llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
        if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
-            decoder_start_token_id = llama_token_bos(model);
+            decoder_start_token_id = llama_vocab_bos(vocab);
        }

        embd_inp.clear();
@@ -742,7 +743,7 @@ int main(int argc, char ** argv) {
            }

            // deal with end of generation tokens in interactive mode
-            if (llama_token_is_eog(model, common_sampler_last(smpl))) {
+            if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
                LOG_DBG("found an EOG token\n");

                if (params.interactive) {
@@ -776,7 +777,7 @@ int main(int argc, char ** argv) {

                if (params.input_prefix_bos) {
                    LOG_DBG("adding input prefix BOS token\n");
-                    embd_inp.push_back(llama_token_bos(model));
+                    embd_inp.push_back(llama_vocab_bos(vocab));
                }

                std::string buffer;
@@ -830,8 +831,8 @@ int main(int argc, char ** argv) {

                    // if user stop generation mid-way, we must add EOT to finish model's last response
                    if (need_insert_eot && format_chat) {
-                        llama_token eot = llama_token_eot(model);
-                        embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_token_eos(model) : eot);
+                        llama_token eot = llama_vocab_eot(vocab);
+                        embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_vocab_eos(vocab) : eot);
                        need_insert_eot = false;
                    }

@@ -866,7 +867,7 @@ int main(int argc, char ** argv) {
        }

        // end of generation
-        if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
+        if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !(params.interactive)) {
            LOG(" [end of text]\n");
            break;
        }