llama : update llama_kv_self API
ggml-ci
parent fd05ab87aa
commit 17b363afd3

30 changed files with 387 additions and 205 deletions
@@ -1693,7 +1693,6 @@ struct server_context {
 
     llama_model * model = nullptr;
     llama_context * ctx = nullptr;
-    llama_kv_cache * kv = nullptr;
 
     const llama_vocab * vocab = nullptr;
 
@@ -1756,8 +1755,6 @@ struct server_context {
             return false;
         }
 
-        kv = llama_get_kv_cache(ctx);
-
         vocab = llama_model_get_vocab(model);
 
         n_ctx = llama_n_ctx(ctx);
@@ -2026,7 +2023,7 @@ struct server_context {
         SRV_DBG("%s", "clearing KV cache\n");
 
         // clear the entire KV cache
-        llama_kv_cache_clear(kv);
+        llama_kv_self_clear(ctx);
         clean_kv_cache = false;
     }
 
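The pattern repeated in every hunk below: the separately fetched llama_kv_cache handle is gone and the equivalent llama_kv_self_* call is made on the llama_context itself. A minimal sketch of the clearing path, assuming only the llama_kv_self_clear declaration from llama.h (the wrapper function is made up for illustration):

#include "llama.h"

// Hypothetical wrapper: wipe the entire KV cache of a context.
// Previously this required fetching a llama_kv_cache handle and calling
// llama_kv_cache_clear(kv); after this commit the context is passed directly.
static void clear_kv(llama_context * ctx) {
    llama_kv_self_clear(ctx);
}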
@@ -2568,8 +2565,8 @@ struct server_context {
                 res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size();
                 res->t_start = metrics.t_start;
 
-                res->kv_cache_tokens_count = llama_kv_cache_n_tokens(kv);
-                res->kv_cache_used_cells = llama_kv_cache_used_cells(kv);
+                res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx);
+                res->kv_cache_used_cells = llama_kv_self_used_cells(ctx);
 
                 res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total;
                 res->t_prompt_processing_total = metrics.t_prompt_processing_total;
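llama_kv_self_n_tokens and llama_kv_self_used_cells replace their llama_kv_cache_* counterparts in the metrics path. A small sketch of reading both counters, assuming the int32_t-returning declarations in llama.h; the struct and helper names are illustrative, not part of the commit:

#include <cstdint>
#include "llama.h"

// Hypothetical snapshot of the two KV-cache counters reported by the server metrics.
struct kv_counters {
    int32_t n_tokens;   // tokens currently stored in the KV cache
    int32_t used_cells; // KV cells that hold at least one token
};

static kv_counters read_kv_counters(llama_context * ctx) {
    return {
        llama_kv_self_n_tokens(ctx),
        llama_kv_self_used_cells(ctx),
    };
}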
@@ -2685,7 +2682,7 @@ struct server_context {
 
                     // Erase token cache
                     const size_t n_erased = slot->cache_tokens.size();
-                    llama_kv_cache_seq_rm(kv, slot->id, -1, -1);
+                    llama_kv_self_seq_rm(ctx, slot->id, -1, -1);
                     slot->cache_tokens.clear();
 
                     auto res = std::make_unique<server_task_result_slot_erase>();
@@ -2753,8 +2750,8 @@ struct server_context {
 
                 SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
 
-                llama_kv_cache_seq_rm (kv, slot.id, n_keep            , n_keep + n_discard);
-                llama_kv_cache_seq_add(kv, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
+                llama_kv_self_seq_rm (ctx, slot.id, n_keep            , n_keep + n_discard);
+                llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
 
                 if (slot.params.cache_prompt) {
                     for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
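The context shift keeps the first n_keep tokens of the sequence, drops the next n_discard, and slides everything that follows back by n_discard so decoding can continue; only the function names and the first argument change in this hunk. A compact sketch of those two calls, assuming the llama_kv_self_seq_rm / llama_kv_self_seq_add signatures from llama.h (the wrapper itself is hypothetical):

#include "llama.h"

// Hypothetical wrapper around the per-slot context shift:
// keep [0, n_keep), remove [n_keep, n_keep + n_discard), then shift the
// remaining positions [n_keep + n_discard, n_past) left by n_discard.
static void context_shift(llama_context * ctx, llama_seq_id seq_id,
                          llama_pos n_keep, llama_pos n_discard, llama_pos n_past) {
    llama_kv_self_seq_rm (ctx, seq_id, n_keep,             n_keep + n_discard);
    llama_kv_self_seq_add(ctx, seq_id, n_keep + n_discard, n_past, -n_discard);
}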
@@ -2941,8 +2938,8 @@ struct server_context {
 
                         const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
 
-                        llama_kv_cache_seq_rm (kv, slot.id, head_p, head_c);
-                        llama_kv_cache_seq_add(kv, slot.id, head_c, -1, kv_shift);
+                        llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c);
+                        llama_kv_self_seq_add(ctx, slot.id, head_c, -1, kv_shift);
 
                         for (size_t i = 0; i < n_match; i++) {
                             slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
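The prompt-cache reuse path is the same two calls with different bounds: the stale range [head_p, head_c) is removed and everything from head_c onward is shifted by head_p - head_c (a negative delta), so the matched block lands at head_p. A sketch under the same assumptions, with an illustrative wrapper name:

#include <cstdint>
#include "llama.h"

// Hypothetical wrapper: move a block of reusable cache entries from head_c down to head_p.
static void shift_cached_block(llama_context * ctx, llama_seq_id seq_id,
                               llama_pos head_p, llama_pos head_c) {
    const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; // <= 0

    llama_kv_self_seq_rm (ctx, seq_id, head_p, head_c);       // drop the stale range
    llama_kv_self_seq_add(ctx, seq_id, head_c, -1, kv_shift); // slide the rest into place
}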
@@ -2980,9 +2977,9 @@ struct server_context {
                 }
 
                 // keep only the common part
-                if (!llama_kv_cache_seq_rm(kv, slot.id, slot.n_past, -1)) {
+                if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) {
                     // could not partially delete (likely using a non-Transformer model)
-                    llama_kv_cache_seq_rm(kv, slot.id, -1, -1);
+                    llama_kv_self_seq_rm(ctx, slot.id, -1, -1);
 
                     // there is no common part left
                     slot.n_past = 0;
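llama_kv_self_seq_rm returns false when the cache cannot drop an arbitrary position range (the in-diff comment points at non-Transformer models), so the server falls back to erasing the whole sequence and reprocessing the prompt from position 0. A sketch of that check, again with an illustrative helper name:

#include "llama.h"

// Hypothetical helper: try to keep only the first n_past cached tokens of a sequence.
// Returns how many tokens actually remain in the cache.
static llama_pos keep_prefix(llama_context * ctx, llama_seq_id seq_id, llama_pos n_past) {
    if (!llama_kv_self_seq_rm(ctx, seq_id, n_past, -1)) {
        // partial removal is not supported - erase the whole sequence instead
        llama_kv_self_seq_rm(ctx, seq_id, -1, -1);
        return 0;
    }
    return n_past;
}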
@@ -3222,7 +3219,7 @@ struct server_context {
                 slot.cache_tokens.push_back(id);
                 slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
 
-                llama_kv_cache_seq_rm(kv, slot.id, slot.n_past, -1);
+                llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1);
 
                 for (size_t i = 0; i < ids.size(); ++i) {
                     completion_token_output result;
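The last hunk applies the same trim after a run of accepted ids is appended to the token cache: every cached position at or beyond slot.n_past is removed before the new tokens are processed. A final one-line sketch, with a hypothetical helper name:

#include "llama.h"

// Hypothetical helper: discard every cached position >= n_past for one sequence.
static void truncate_seq(llama_context * ctx, llama_seq_id seq_id, llama_pos n_past) {
    llama_kv_self_seq_rm(ctx, seq_id, n_past, -1);
}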