llama : add llama_kv_cache_compress (EXPERIMENTAL)

2024-02-25 22:16:13 +02:00 · 2024-02-25 22:16:13 +02:00 · 14d757066b
commit 14d757066b
parent c24a2a6e60
3 changed files with 262 additions and 0 deletions
--- a/llama.h
+++ b/llama.h
@@ -557,6 +557,14 @@ extern "C" {
            struct llama_context * ctx,
                    llama_seq_id   seq_id);

+    // [EXPERIMENTAL] Compress the data in the KV cache
+    // This will be applied:
+    //   - lazily on next llama_decode()
+    //   - explicitly with llama_kv_cache_update()
+    LLAMA_API void llama_kv_cache_compress(
+            struct llama_context * ctx,
+                       llama_pos   delta); // NOTE(review): meaning of delta is not documented here - TODO confirm
+
    // Defragment the KV cache
    // This will be applied:
    //   - lazily on next llama_decode()