llama : allow exporting a view of the KV cache (#4180)

* Allow exporting a view of the KV cache
* Allow dumping the sequences per cell in common
* Track max contiguous cells value and position as well
* Fix max contiguous empty cells index calculation
  Make dump functions deal with lengths or sequence counts > 10 better
* Fix off-by-one error in dump_kv_cache_view
* Add doc comments for KV cache view functions
  Eliminate cell sequence struct; use llama_seq_id directly
  Minor cleanups
2023-11-23 09:31:20 -07:00 · 2023-11-23 09:31:20 -07:00 · 5df7d06c42
commit 5df7d06c42
parent 671f639c59
5 changed files with 227 additions and 0 deletions
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -172,6 +172,8 @@ int main(int argc, char ** argv) {
    int32_t n_total_gen    = 0;
    int32_t n_cache_miss   = 0;

+    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients);
+
    const auto t_main_start = ggml_time_us();

    LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__);
@@ -201,6 +203,9 @@ int main(int argc, char ** argv) {
    LOG_TEE("Processing requests ...\n\n");

    while (true) {
+        llama_kv_cache_view_update(ctx, &kvc_view);
+        dump_kv_cache_view_seqs(kvc_view, 40);
+
        llama_batch_clear(batch);

        // decode any currently ongoing sequences