llama : offload KV cache per-layer
parent c294c78eb7
commit 986b3da76a
2 changed files with 86 additions and 151 deletions
llama.h | 2 ++

llama.h
@@ -196,6 +196,8 @@ extern "C" {
         bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool embedding;  // embedding mode only
+        bool offload_k;
+        bool offload_v;
     };

     // model quantization parameters
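For context, a minimal sketch of how a caller might enable the new per-layer KV cache offload flags. It assumes the struct shown above is llama_context_params and that the model/context creation calls from this era of the llama.cpp API (llama_backend_init, llama_model_default_params, llama_load_model_from_file, llama_context_default_params, llama_new_context_with_model) are available; the model path and n_gpu_layers value are placeholders, not part of this commit:

// Sketch only: names outside the diff above are assumptions about the
// surrounding llama.cpp API, not something introduced by this commit.
#include "llama.h"

int main(void) {
    llama_backend_init(false); // era-appropriate init call (numa = false)

    struct llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99; // placeholder: offload all repeating layers to the GPU

    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == NULL) {
        return 1;
    }

    struct llama_context_params cparams = llama_context_default_params();
    cparams.offload_k = true;  // keep the per-layer K cache on the GPU
    cparams.offload_v = true;  // keep the per-layer V cache on the GPU

    struct llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ... evaluate tokens as usual ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}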