llama : add llama_vocab, functions -> methods, naming (#11110)

* llama : functions -> methods (#11110)
* llama : add struct llama_vocab to the API (#11156) ggml-ci
* hparams : move vocab params to llama_vocab (#11159) ggml-ci
* vocab : more pimpl (#11165) ggml-ci
* vocab : minor tokenization optimizations (#11160) ggml-ci Co-authored-by: Diego Devesa <slarengh@gmail.com>
* lora : update API names (#11167) ggml-ci
* llama : update API names to use correct prefix (#11174)
* llama : update API names to use correct prefix ggml-ci
* cont ggml-ci
* cont ggml-ci
* minor [no ci]
* vocab : llama_vocab_add_[be]os -> llama_vocab_get_add_[be]os (#11174) ggml-ci
* vocab : llama_vocab_n_vocab -> llama_vocab_n_tokens (#11174) ggml-ci
---------
Co-authored-by: Diego Devesa <slarengh@gmail.com>
2025-01-12 11:32:42 +02:00 · 2025-01-12 11:32:42 +02:00 · afa8a9ec9b
commit afa8a9ec9b
parent c05e8c9934
68 changed files with 5855 additions and 5400 deletions
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -84,6 +84,7 @@ int main(int argc, char ** argv) {
    model_params.n_gpu_layers = ngl;

    llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);
+    const llama_vocab * vocab = llama_model_get_vocab(model);

    if (model == NULL) {
        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
@@ -93,11 +94,11 @@ int main(int argc, char ** argv) {
    // tokenize the prompt

    // find the number of tokens in the prompt
-    const int n_prompt = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
+    const int n_prompt = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true);

    // allocate space for the tokens and tokenize the prompt
    std::vector<llama_token> prompt_tokens(n_prompt);
-    if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
+    if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
        fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
        return 1;
    }
@@ -112,7 +113,7 @@ int main(int argc, char ** argv) {
    // enable performance counters
    ctx_params.no_perf = false;

-    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+    llama_context * ctx = llama_init_from_model(model, ctx_params);

    if (ctx == NULL) {
        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
@@ -131,7 +132,7 @@ int main(int argc, char ** argv) {

    for (auto id : prompt_tokens) {
        char buf[128];
-        int n = llama_token_to_piece(model, id, buf, sizeof(buf), 0, true);
+        int n = llama_token_to_piece(vocab, id, buf, sizeof(buf), 0, true);
        if (n < 0) {
            fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
            return 1;
@@ -164,12 +165,12 @@ int main(int argc, char ** argv) {
            new_token_id = llama_sampler_sample(smpl, ctx, -1);

            // is it an end of generation?
-            if (llama_token_is_eog(model, new_token_id)) {
+            if (llama_vocab_is_eog(vocab, new_token_id)) {
                break;
            }

            char buf[128];
-            int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
+            int n = llama_token_to_piece(vocab, new_token_id, buf, sizeof(buf), 0, true);
            if (n < 0) {
                fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
                return 1;