From f78b396ee7a5d4c47cf3e3a8cb9fb02a4d3fe250 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 13 Jan 2025 14:13:11 +0200 Subject: [PATCH 01/28] llama : add struct llama_kv_cache (wip) [no ci] --- common/common.cpp | 6 +- common/speculative.cpp | 10 +- examples/embedding/embedding.cpp | 5 +- include/llama.h | 79 +++---- src/llama-context.cpp | 16 +- src/llama-kv-cache.cpp | 286 ++++--------------------- src/llama-kv-cache.h | 350 ++++++++++++++++++++++++++----- src/llama.cpp | 91 ++------ 8 files changed, 428 insertions(+), 415 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 6dea8e3d2..29de45189 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -952,7 +952,9 @@ struct common_init_result common_init_from_params(common_params & params) { return iparams; } - if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) { + llama_kv_cache * kv = llama_get_kv_cache(lctx); + + if (params.ctx_shift && !llama_kv_cache_can_shift(kv)) { LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__); params.ctx_shift = false; } @@ -1057,7 +1059,7 @@ struct common_init_result common_init_from_params(common_params & params) { if (llama_model_has_decoder(model)) { llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch))); } - llama_kv_cache_clear(lctx); + llama_kv_cache_clear(kv); llama_synchronize(lctx); llama_perf_context_reset(lctx); } diff --git a/common/speculative.cpp b/common/speculative.cpp index 318e96ea3..6ac058517 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -171,8 +171,10 @@ llama_tokens common_speculative_gen_draft( llama_tokens result; result.reserve(params.n_draft); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + if (reuse_n == 0) { - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); prompt.clear(); } else { @@ -191,14 +193,14 @@ llama_tokens common_speculative_gen_draft( } if (reuse_i > 0) { - llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i); - llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i); + llama_kv_cache_seq_rm (kv, 0, 0, reuse_i); + llama_kv_cache_seq_add(kv, 0, reuse_i, -1, -reuse_i); prompt.erase(prompt.begin(), prompt.begin() + reuse_i); } if (reuse_n < (int) prompt.size()) { - llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1); + llama_kv_cache_seq_rm (kv, 0, reuse_n, -1); prompt.erase(prompt.begin() + reuse_n, prompt.end()); } diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 38d22c90f..fda0949f1 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -34,10 +34,11 @@ static void batch_add_seq(llama_batch & batch, const std::vector & toke static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) { const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); - const struct llama_model * model = llama_get_model(ctx); + const llama_model * model = llama_get_model(ctx); + llama_kv_cache * kv = llama_get_kv_cache(ctx); // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); diff --git a/include/llama.h b/include/llama.h index 3b75e7607..08b8658ad 100644 --- a/include/llama.h +++ b/include/llama.h @@ -60,6 +60,7 @@ extern "C" { struct llama_model; struct llama_context; struct llama_sampler; + struct llama_kv_cache; typedef int32_t llama_pos; typedef 
int32_t llama_token; @@ -467,8 +468,9 @@ extern "C" { DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead"); - LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx); - LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); + LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx); // TODO: remove const? + LLAMA_API struct llama_kv_cache * llama_get_kv_cache( struct llama_context * ctx); + LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model); @@ -584,7 +586,7 @@ extern "C" { // KV cache // - // TODO: remove llama_kv_cache_view_* API + // TODO: start using struct llama_kv_cache // Information associated with an individual cell in the KV cache view. struct llama_kv_cache_view_cell { @@ -639,14 +641,20 @@ extern "C" { // Returns the number of tokens in the KV cache (slow, use only for debug) // If a KV cell has multiple sequences assigned to it, it will be counted multiple times - LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx); + LLAMA_API int32_t llama_kv_cache_n_tokens(const struct llama_kv_cache * kv); + + DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx), + "use llama_kv_cache_n_tokens instead"); // Returns the number of used KV cells (i.e. have at least one sequence assigned to them) - LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx); + LLAMA_API int32_t llama_kv_cache_used_cells(const struct llama_kv_cache * kv); + + DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx), + "use llama_kv_cache_used_cells instead"); // Clear the KV cache - both cell info is erased and KV data is zeroed LLAMA_API void llama_kv_cache_clear( - struct llama_context * ctx); + struct llama_kv_cache * kv); // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) // Returns false if a partial sequence cannot be removed. 
Removing a whole sequence never fails @@ -654,26 +662,26 @@ extern "C" { // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) LLAMA_API bool llama_kv_cache_seq_rm( - struct llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1); + struct llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1); // Copy all tokens that belong to the specified sequence to another sequence // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_cache_seq_cp( - struct llama_context * ctx, - llama_seq_id seq_id_src, - llama_seq_id seq_id_dst, - llama_pos p0, - llama_pos p1); + struct llama_kv_cache * kv, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1); // Removes all tokens that do not belong to the specified sequence LLAMA_API void llama_kv_cache_seq_keep( - struct llama_context * ctx, - llama_seq_id seq_id); + struct llama_kv_cache * kv, + llama_seq_id seq_id); // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) // If the KV cache is RoPEd, the KV data is updated accordingly: @@ -682,11 +690,11 @@ extern "C" { // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_cache_seq_add( - struct llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - llama_pos delta); + struct llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta); // Integer division of the positions by factor of `d > 1` // If the KV cache is RoPEd, the KV data is updated accordingly: @@ -695,31 +703,28 @@ extern "C" { // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_cache_seq_div( - struct llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - int d); + struct llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d); // Returns the largest position present in the KV cache for the specified sequence LLAMA_API llama_pos llama_kv_cache_seq_pos_max( - struct llama_context * ctx, - llama_seq_id seq_id); - - // TODO: the llama_kv_cache_defrag and llama_kv_cache_update API tightly couples llama_context with llama_kv_cache - // how to avoid this? + struct llama_kv_cache * kv, + llama_seq_id seq_id); // Defragment the KV cache // This will be applied: // - lazily on next llama_decode() // - explicitly with llama_kv_cache_update() - LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx); - - // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) - LLAMA_API void llama_kv_cache_update(struct llama_context * ctx); + LLAMA_API void llama_kv_cache_defrag(struct llama_kv_cache * kv); // Check if the context supports KV cache shifting - LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx); + LLAMA_API bool llama_kv_cache_can_shift(const struct llama_kv_cache * kv); + + // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) 
+ LLAMA_API void llama_update_kv_cache(struct llama_context * ctx, struct llama_kv_cache * kv); // // State / sessions diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 671d2a81a..bf5a77cca 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -602,11 +602,15 @@ uint32_t llama_n_seq_max(const struct llama_context * ctx) { return ctx->kv_self.size; } -const struct llama_model * llama_get_model(const struct llama_context * ctx) { +const llama_model * llama_get_model(const llama_context * ctx) { return &ctx->model; } -enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) { +llama_kv_cache * llama_get_kv_cache(llama_context * ctx) { + return &ctx->kv_self; +} + +enum llama_pooling_type llama_pooling_type(const llama_context * ctx) { return ctx->cparams.pooling_type; } @@ -1142,7 +1146,7 @@ struct llama_data_read { if (dest_seq_id != -1) { // single sequence - llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); + kv_self.seq_rm(dest_seq_id, -1, -1); llama_ubatch batch = ctx->sbatch.reserve_ubatch(cell_count, /* has_embd */ false); batch.n_tokens = cell_count; @@ -1185,7 +1189,7 @@ struct llama_data_read { return false; } - llama_kv_cache_clear(kv_self); + kv_self.clear(); for (uint32_t i = 0; i < cell_count; ++i) { llama_kv_cell & cell = kv_self.cells[i]; @@ -1362,9 +1366,9 @@ struct llama_data_read { if (!res) { if (seq_id == -1) { - llama_kv_cache_clear(ctx); + ctx->kv_self.clear(); } else { - llama_kv_cache_seq_rm(ctx, seq_id, -1, -1); + ctx->kv_self.seq_rm(seq_id, -1, -1); } throw std::runtime_error("failed to restore kv cache"); } diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index feffdf0de..b0d5a9318 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -350,277 +350,67 @@ uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) { return 0; } -void llama_kv_cache_clear(struct llama_kv_cache & cache) { - for (int32_t i = 0; i < (int32_t) cache.size; ++i) { - cache.cells[i].pos = -1; - cache.cells[i].seq_id.clear(); - cache.cells[i].src = -1; - cache.cells[i].tail = -1; - } - cache.head = 0; - cache.used = 0; - - for (auto & buf : cache.bufs) { - ggml_backend_buffer_clear(buf.get(), 0); - } +void llama_kv_cache_clear(llama_kv_cache * kv) { + kv->clear(); } bool llama_kv_cache_seq_rm( - struct llama_kv_cache & cache, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1) { - uint32_t new_head = cache.size; - - if (p0 < 0) p0 = 0; - if (p1 < 0) p1 = std::numeric_limits::max(); - - // models like Mamba or RWKV can't have a state partially erased - if (cache.recurrent) { - if (seq_id >= (int64_t) cache.size) { - // could be fatal - return false; - } - if (0 <= seq_id) { - int32_t & tail_id = cache.cells[seq_id].tail; - if (tail_id >= 0) { - const llama_kv_cell & cell = cache.cells[tail_id]; - // partial intersection is invalid - if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) { - return false; - } - // invalidate tails which will be cleared - if (p0 <= cell.pos && cell.pos < p1) { - tail_id = -1; - } - } - } else { - // seq_id is negative, then the range should include everything or nothing - if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits::max())) { - return false; - } - } - } - - for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { - if (seq_id < 0) { - cache.cells[i].seq_id.clear(); - } else if (cache.cells[i].has_seq_id(seq_id)) { - cache.cells[i].seq_id.erase(seq_id); - } else { - continue; - } - if (cache.cells[i].is_empty()) 
{ - // keep count of the number of used cells - if (cache.cells[i].pos >= 0) cache.used--; - - cache.cells[i].pos = -1; - cache.cells[i].src = -1; - if (new_head == cache.size) new_head = i; - } - } - } - - // If we freed up a slot, set head to it so searching can start there. - if (new_head != cache.size && new_head < cache.head) cache.head = new_head; - - return true; + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + return kv->seq_rm(seq_id, p0, p1); } void llama_kv_cache_seq_cp( - struct llama_kv_cache & cache, - llama_seq_id seq_id_src, - llama_seq_id seq_id_dst, - llama_pos p0, - llama_pos p1) { - if (p0 < 0) p0 = 0; - if (p1 < 0) p1 = std::numeric_limits::max(); - - if (cache.recurrent) { - if ((uint32_t) seq_id_dst < cache.size && (uint32_t) seq_id_src < cache.size) { - llama_kv_cell & tail_src = cache.cells[seq_id_src]; - llama_kv_cell & tail_dst = cache.cells[seq_id_dst]; - if (tail_dst.tail >= 0) { - // clear destination seq_id if it wasn't empty - llama_kv_cell & cell_dst = cache.cells[tail_dst.tail]; - - cell_dst.seq_id.erase(seq_id_dst); - tail_dst.tail = -1; - if (cell_dst.seq_id.empty()) { - cell_dst.pos = -1; - cell_dst.delta = -1; - cell_dst.src = -1; - cache.used -= 1; - } - } - if (tail_src.tail >= 0) { - llama_kv_cell & cell_src = cache.cells[tail_src.tail]; - - cell_src.seq_id.insert(seq_id_dst); - tail_dst.tail = tail_src.tail; - } - } - - return; - } - // otherwise, this is the KV cache of a Transformer-like model - - cache.head = 0; - - for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { - cache.cells[i].seq_id.insert(seq_id_dst); - } - } + llama_kv_cache * kv, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + kv->seq_cp(seq_id_src, seq_id_dst, p0, p1); } -void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) { - uint32_t new_head = cache.size; - - for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.recurrent && (llama_seq_id) i != seq_id) { - cache.cells[i].tail = -1; - } - if (!cache.cells[i].has_seq_id(seq_id)) { - if (cache.cells[i].pos >= 0) cache.used--; - cache.cells[i].pos = -1; - cache.cells[i].src = -1; - cache.cells[i].seq_id.clear(); - if (new_head == cache.size) new_head = i; - } else { - cache.cells[i].seq_id.clear(); - cache.cells[i].seq_id.insert(seq_id); - } - } - - // If we freed up a slot, set head to it so searching can start there. - if (new_head != cache.size && new_head < cache.head) cache.head = new_head; +void llama_kv_cache_seq_keep(llama_kv_cache * kv, llama_seq_id seq_id) { + kv->seq_keep(seq_id); } void llama_kv_cache_seq_add( - struct llama_kv_cache & cache, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - llama_pos delta) { - uint32_t new_head = cache.size; - - if (p0 < 0) p0 = 0; - if (p1 < 0) p1 = std::numeric_limits::max(); - // If there is no range then return early to avoid looping over the cache. 
- if (p0 == p1) return; - - if (cache.recurrent) { - // for Mamba-like or RWKV models, only the pos needs to be shifted - if (0 <= seq_id && seq_id < (int64_t) cache.size) { - const int32_t tail_id = cache.cells[seq_id].tail; - if (tail_id >= 0) { - llama_kv_cell & cell = cache.cells[tail_id]; - if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { - cell.pos += delta; - } - } - } - return; - } - - for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { - cache.has_shift = true; - cache.cells[i].pos += delta; - cache.cells[i].delta += delta; - - if (cache.cells[i].pos < 0) { - if (!cache.cells[i].is_empty()) { - cache.used--; - } - cache.cells[i].pos = -1; - cache.cells[i].seq_id.clear(); - if (new_head == cache.size) { - new_head = i; - } - } - } - } - - // If we freed up a slot, set head to it so searching can start there. - // Otherwise we just start the next search from the beginning. - cache.head = new_head != cache.size ? new_head : 0; + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + kv->seq_add(seq_id, p0, p1, delta); } void llama_kv_cache_seq_div( - struct llama_kv_cache & cache, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - int d) { - if (p0 < 0) p0 = 0; - if (p1 < 0) p1 = std::numeric_limits::max(); - // If there is no range then return early to avoid looping over the cache. - if (p0 == p1) return; - - if (cache.recurrent) { - // for Mamba-like or RWKV models, only the pos needs to be changed - if (0 <= seq_id && seq_id < (int64_t) cache.size) { - const int32_t tail_id = cache.cells[seq_id].tail; - if (tail_id >= 0) { - llama_kv_cell & cell = cache.cells[tail_id]; - if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { - cell.pos /= d; - } - } - } - return; - } - - for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { - cache.has_shift = true; - - { - llama_pos p_old = cache.cells[i].pos; - cache.cells[i].pos /= d; - cache.cells[i].delta += cache.cells[i].pos - p_old; - } - } - } + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + kv->seq_div(seq_id, p0, p1, d); } -llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) { - llama_pos result = 0; - - for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.cells[i].has_seq_id(seq_id)) { - result = std::max(result, cache.cells[i].pos); - } - } - - return result; +llama_pos llama_kv_cache_seq_pos_max(llama_kv_cache * kv, llama_seq_id seq_id) { + return kv->seq_pos_max(seq_id); } -void llama_kv_cache_defrag(struct llama_kv_cache & cache) { - if (!cache.recurrent) { - cache.do_defrag = true; - } +void llama_kv_cache_defrag(llama_kv_cache * kv) { + kv->defrag(); } -int32_t llama_get_kv_cache_token_count(const struct llama_kv_cache & kv) { - int result = 0; - - for (uint32_t i = 0; i < kv.size; i++) { - result += kv.cells[i].seq_id.size(); - } - - return result; +int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv) { + return kv->n_tokens(); } -int32_t llama_get_kv_cache_used_cells(const struct llama_kv_cache & kv) { - return kv.used; +int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv) { + return kv->used; } -bool llama_kv_cache_can_shift(const struct llama_kv_cache & kv) { - return kv.can_shift; +bool llama_kv_cache_can_shift(const llama_kv_cache * kv) { + return kv->can_shift; } // @@ -632,7 
+422,7 @@ struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache /*.n_cells = */ 0, /*.n_seq_max = */ n_seq_max, /*.token_count = */ 0, - /*.used_cells = */ llama_get_kv_cache_used_cells(kv), + /*.used_cells = */ llama_kv_cache_used_cells(&kv), /*.max_contiguous = */ 0, /*.max_contiguous_idx = */ -1, /*.cells = */ nullptr, diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index dca6f3998..b0bb1cfb1 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -57,6 +57,16 @@ struct llama_kv_cache { std::vector ctxs; std::vector bufs; + int32_t n_tokens() const { + int32_t result = 0; + + for (uint32_t i = 0; i < size; i++) { + result += cells[i].seq_id.size(); + } + + return result; + } + size_t total_size() const { size_t size = 0; for (const auto & buf : bufs) { @@ -75,6 +85,297 @@ struct llama_kv_cache { return max_pos; } + + void clear() { + for (int32_t i = 0; i < (int32_t) size; ++i) { + cells[i].pos = -1; + cells[i].seq_id.clear(); + cells[i].src = -1; + cells[i].tail = -1; + } + head = 0; + used = 0; + + for (auto & buf : bufs) { + ggml_backend_buffer_clear(buf.get(), 0); + } + } + + bool seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + uint32_t new_head = size; + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // models like Mamba or RWKV can't have a state partially erased + if (recurrent) { + if (seq_id >= (int64_t) size) { + // could be fatal + return false; + } + if (0 <= seq_id) { + int32_t & tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + const llama_kv_cell & cell = cells[tail_id]; + // partial intersection is invalid + if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) { + return false; + } + // invalidate tails which will be cleared + if (p0 <= cell.pos && cell.pos < p1) { + tail_id = -1; + } + } + } else { + // seq_id is negative, then the range should include everything or nothing + if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits::max())) { + return false; + } + } + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].pos >= p0 && cells[i].pos < p1) { + if (seq_id < 0) { + cells[i].seq_id.clear(); + } else if (cells[i].has_seq_id(seq_id)) { + cells[i].seq_id.erase(seq_id); + } else { + continue; + } + if (cells[i].is_empty()) { + // keep count of the number of used cells + if (cells[i].pos >= 0) { + used--; + } + + cells[i].pos = -1; + cells[i].src = -1; + + if (new_head == size) { + new_head = i; + } + } + } + } + + // If we freed up a slot, set head to it so searching can start there. 
+ if (new_head != size && new_head < head) { + head = new_head; + } + + return true; + } + + void seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + if (seq_id_src == seq_id_dst) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + if (recurrent) { + if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) { + llama_kv_cell & tail_src = cells[seq_id_src]; + llama_kv_cell & tail_dst = cells[seq_id_dst]; + if (tail_dst.tail >= 0) { + // clear destination seq_id if it wasn't empty + llama_kv_cell & cell_dst = cells[tail_dst.tail]; + + cell_dst.seq_id.erase(seq_id_dst); + tail_dst.tail = -1; + if (cell_dst.seq_id.empty()) { + cell_dst.pos = -1; + cell_dst.delta = -1; + cell_dst.src = -1; + used -= 1; + } + } + if (tail_src.tail >= 0) { + llama_kv_cell & cell_src = cells[tail_src.tail]; + + cell_src.seq_id.insert(seq_id_dst); + tail_dst.tail = tail_src.tail; + } + } + + return; + } + + // otherwise, this is the KV of a Transformer-like model + head = 0; + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id_src) && cells[i].pos >= p0 && cells[i].pos < p1) { + cells[i].seq_id.insert(seq_id_dst); + } + } + } + + void seq_keep(llama_seq_id seq_id) { + uint32_t new_head = size; + + for (uint32_t i = 0; i < size; ++i) { + if (recurrent && (llama_seq_id) i != seq_id) { + cells[i].tail = -1; + } + + if (!cells[i].has_seq_id(seq_id)) { + if (cells[i].pos >= 0) { + used--; + } + + cells[i].pos = -1; + cells[i].src = -1; + cells[i].seq_id.clear(); + + if (new_head == size){ + new_head = i; + } + } else { + cells[i].seq_id.clear(); + cells[i].seq_id.insert(seq_id); + } + } + + // If we freed up a slot, set head to it so searching can start there. + if (new_head != size && new_head < head) { + head = new_head; + } + } + + void seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { + if (delta == 0) { + return; + } + + uint32_t new_head = size; + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // If there is no range then return early to avoid looping over the + if (p0 == p1) { + return; + } + + if (recurrent) { + // for Mamba-like or RWKV models, only the pos needs to be shifted + if (0 <= seq_id && seq_id < (int64_t) size) { + const int32_t tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + llama_kv_cell & cell = cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos += delta; + } + } + } + return; + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { + has_shift = true; + cells[i].pos += delta; + cells[i].delta += delta; + + if (cells[i].pos < 0) { + if (!cells[i].is_empty()) { + used--; + } + cells[i].pos = -1; + cells[i].seq_id.clear(); + if (new_head == size) { + new_head = i; + } + } + } + } + + // If we freed up a slot, set head to it so searching can start there. + // Otherwise we just start the next search from the beginning. + head = new_head != size ? new_head : 0; + } + + void seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { + if (d == 1) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // If there is no range then return early to avoid looping over the cache. 
+ if (p0 == p1) { + return; + } + + if (recurrent) { + // for Mamba-like or RWKV models, only the pos needs to be changed + if (0 <= seq_id && seq_id < (int64_t) size) { + const int32_t tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + llama_kv_cell & cell = cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos /= d; + } + } + } + + return; + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { + has_shift = true; + + { + llama_pos p_old = cells[i].pos; + cells[i].pos /= d; + cells[i].delta += cells[i].pos - p_old; + } + } + } + } + + llama_pos seq_pos_max(llama_seq_id seq_id) { + llama_pos result = 0; + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id)) { + result = std::max(result, cells[i].pos); + } + } + + return result; + } + + void defrag() { + if (!recurrent) { + do_defrag = true; + } + } }; // a structure holds information about the slot found in llama_kv_cache_find_slot @@ -112,51 +413,6 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // find how many cells are currently in use uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache); -void llama_kv_cache_clear(struct llama_kv_cache & cache); - -bool llama_kv_cache_seq_rm( - struct llama_kv_cache & cache, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1); - -void llama_kv_cache_seq_cp( - struct llama_kv_cache & cache, - llama_seq_id seq_id_src, - llama_seq_id seq_id_dst, - llama_pos p0, - llama_pos p1); - -void llama_kv_cache_seq_keep( - struct llama_kv_cache & cache, - llama_seq_id seq_id); - -void llama_kv_cache_seq_add( - struct llama_kv_cache & cache, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - llama_pos delta); - -void llama_kv_cache_seq_div( - struct llama_kv_cache & cache, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - int d); - -llama_pos llama_kv_cache_seq_pos_max( - struct llama_kv_cache & cache, - llama_seq_id seq_id); - -void llama_kv_cache_defrag(struct llama_kv_cache & cache); - -int32_t llama_get_kv_cache_token_count(const struct llama_kv_cache & kv); - -int32_t llama_get_kv_cache_used_cells(const struct llama_kv_cache & kv); - -bool llama_kv_cache_can_shift(const struct llama_kv_cache & kv); - // // kv cache view // @@ -206,10 +462,10 @@ struct llama_kv_slot_restorer { cache.n = old_state.n; if (cache.recurrent) { // recurrent models like Mamba or RWKV can't have a state partially erased - llama_kv_cache_seq_rm(cache, -1, -1, -1); + cache.seq_rm(-1, -1, -1); } else { for (auto & slot : slot_boundaries) { - llama_kv_cache_seq_rm(cache, -1, slot.first, slot.second); + cache.seq_rm(-1, slot.first, slot.second); } } } diff --git a/src/llama.cpp b/src/llama.cpp index 094157ccf..87dd512b2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8564,7 +8564,7 @@ static int llama_decode_impl( // non-causal masks do not use the KV cache if (hparams.causal_attn) { - llama_kv_cache_update(&lctx); + llama_update_kv_cache(&lctx, &lctx.kv_self); // TODO: lctx->update_kv_cache() // if we have enough unused cells before the current head -> // better to start searching from the beginning of the cache, hoping to fill it @@ -8760,7 +8760,7 @@ static int llama_decode_impl( if (fragmentation > cparams.defrag_thold) { //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation); - llama_kv_cache_defrag(kv_self); + kv_self.defrag(); } } @@ -9182,11 +9182,11 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) { //LLAMA_LOG_INFO("(tmp log) 
KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0); } -static void llama_kv_cache_update_impl(struct llama_context & lctx) { +static void llama_update_kv_cache_impl(llama_context & lctx, llama_kv_cache & kv) { bool need_reserve = false; - if (lctx.kv_self.has_shift) { - if (!llama_kv_cache_can_shift(&lctx)) { + if (kv.has_shift) { + if (!kv.can_shift) { GGML_ABORT("The current context does not support K-shift"); } @@ -9206,23 +9206,21 @@ static void llama_kv_cache_update_impl(struct llama_context & lctx) { } { - auto & kv_self = lctx.kv_self; + kv.has_shift = false; - kv_self.has_shift = false; - - for (uint32_t i = 0; i < kv_self.size; ++i) { - kv_self.cells[i].delta = 0; + for (uint32_t i = 0; i < kv.size; ++i) { + kv.cells[i].delta = 0; } } } // defragment the KV cache if needed - if (lctx.kv_self.do_defrag) { + if (kv.do_defrag) { llama_kv_cache_defrag_impl(lctx); need_reserve = true; - lctx.kv_self.do_defrag = false; + kv.do_defrag = false; } // reserve a worst case graph again @@ -9845,6 +9843,7 @@ struct llama_context * llama_init_from_model( return ctx; } +// deprecated struct llama_context * llama_new_context_with_model( struct llama_model * model, struct llama_context_params params) { @@ -9855,73 +9854,27 @@ struct llama_context * llama_new_context_with_model( // kv cache // -// TODO: tmp bridges below until `struct llama_kv_cache` is exposed through the public API - -struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) { +struct llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) { return llama_kv_cache_view_init(ctx->kv_self, n_seq_max); } -void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) { +void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * view) { llama_kv_cache_view_update(view, ctx->kv_self); } -int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx) { - return llama_get_kv_cache_token_count(ctx->kv_self); +// deprecated +int32_t llama_get_kv_cache_token_count(const llama_context * ctx) { + return llama_kv_cache_n_tokens(&ctx->kv_self); } -int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) { - return llama_get_kv_cache_used_cells(ctx->kv_self); +// deprecated +int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) { + return llama_kv_cache_used_cells(&ctx->kv_self); } -void llama_kv_cache_clear(struct llama_context * ctx) { - llama_kv_cache_clear(ctx->kv_self); -} - -bool llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) { - return llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1); -} - -void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { - if (seq_id_src == seq_id_dst) { - return; - } - llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); -} - -void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) { - llama_kv_cache_seq_keep(ctx->kv_self, seq_id); -} - -void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { - if (delta == 0) { - return; - } - - llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta); -} - -void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { - if (d == 1) { - return; - } - - llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d); -} 
- -llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) { - return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id); -} - -void llama_kv_cache_defrag(struct llama_context * ctx) { - llama_kv_cache_defrag(ctx->kv_self); -} - -void llama_kv_cache_update(struct llama_context * ctx) { - llama_kv_cache_update_impl(*ctx); -} - -bool llama_kv_cache_can_shift(struct llama_context * ctx) { - return llama_kv_cache_can_shift(ctx->kv_self); +// TODO: move to llama-context +void llama_update_kv_cache(llama_context * ctx, llama_kv_cache * kv) { + llama_update_kv_cache_impl(*ctx, *kv); } /// From e4550fbafc44403b243fe029937a97a0aed7bbd6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 13 Jan 2025 14:56:52 +0200 Subject: [PATCH 02/28] llama : cont ggml-ci --- examples/batched-bench/batched-bench.cpp | 6 ++-- .../cvector-generator/cvector-generator.cpp | 3 +- examples/gritlm/gritlm.cpp | 8 +++-- examples/imatrix/imatrix.cpp | 4 ++- examples/infill/infill.cpp | 6 ++-- examples/llama-bench/llama-bench.cpp | 6 ++-- examples/lookahead/lookahead.cpp | 13 ++++---- examples/lookup/lookup.cpp | 3 +- examples/main/main.cpp | 14 +++++---- examples/parallel/parallel.cpp | 11 +++---- examples/passkey/passkey.cpp | 30 ++++++++++--------- examples/perplexity/perplexity.cpp | 24 +++++++++++---- examples/retrieval/retrieval.cpp | 4 ++- examples/run/run.cpp | 7 +++-- examples/save-load-state/save-load-state.cpp | 4 ++- examples/server/server.cpp | 25 +++++++++------- examples/simple-chat/simple-chat.cpp | 6 ++-- .../speculative-simple/speculative-simple.cpp | 4 ++- examples/speculative/speculative.cpp | 29 ++++++++++-------- 19 files changed, 128 insertions(+), 79 deletions(-) diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 0659ab6f1..fcbad37bb 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -57,6 +57,8 @@ int main(int argc, char ** argv) { return 1; } + llama_kv_cache * kv = llama_get_kv_cache(ctx); + const int32_t n_kv_max = llama_n_ctx(ctx); llama_batch batch = llama_batch_init(n_kv_max, 0, 1); @@ -132,7 +134,7 @@ int main(int argc, char ** argv) { const auto t_pp_start = ggml_time_us(); - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); if (!decode_helper(ctx, batch, ctx_params.n_batch)) { LOG_ERR("%s: llama_decode() failed\n", __func__); @@ -141,7 +143,7 @@ int main(int argc, char ** argv) { if (is_pp_shared) { for (int32_t i = 1; i < pl; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); + llama_kv_cache_seq_cp(kv, 0, i, -1, -1); } } diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index 413b71d34..adb4a60ad 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -342,7 +342,8 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { } static bool get_hidden_layers(llama_context * ctx, std::vector & tokens) { - llama_kv_cache_clear(ctx); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + llama_kv_cache_clear(kv); if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { fprintf(stderr, "%s : failed to eval\n", __func__); return false; diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 72eb46257..16437453e 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -13,6 +13,8 @@ static std::vector> encode(llama_context * ctx, const std::ve const llama_model * 
model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1); for (uint64_t i = 0; i < sentences.size(); i++) { @@ -45,7 +47,7 @@ static std::vector> encode(llama_context * ctx, const std::ve } // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); llama_set_embeddings(ctx, true); llama_set_causal_attn(ctx, false); @@ -100,9 +102,11 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + llama_token eos_token = llama_vocab_eos(vocab); - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); llama_set_embeddings(ctx, false); llama_set_causal_attn(ctx, true); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index b5f3feb9f..5efe4f019 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -431,6 +431,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + const bool add_bos = llama_vocab_get_add_bos(vocab); const int n_ctx = llama_n_ctx(ctx); @@ -497,7 +499,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); llama_batch batch = llama_batch_init(n_batch, 0, 1); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 489a208b6..de8e77695 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -139,6 +139,8 @@ int main(int argc, char ** argv) { return 1; } + llama_kv_cache * kv = llama_get_kv_cache(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); const int n_ctx_train = llama_model_n_ctx_train(model); @@ -332,8 +334,8 @@ int main(int argc, char ** argv) { LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); - llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + llama_kv_cache_seq_rm (kv, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + llama_kv_cache_seq_add(kv, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); n_past -= n_discard; diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 4ac19ca86..8843c0048 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1575,9 +1575,11 @@ int main(int argc, char ** argv) { return 1; } + llama_kv_cache * kv = llama_get_kv_cache(ctx); + test t(inst, lmodel, ctx); - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); // cool off before the test if (params.delay) { @@ -1617,7 +1619,7 @@ int main(int argc, char ** argv) { } for (int i = 0; i < params.reps; i++) { - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); uint64_t t_start = get_time_ns(); diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index 2f0898e62..1219c2074 100644 --- a/examples/lookahead/lookahead.cpp 
+++ b/examples/lookahead/lookahead.cpp @@ -60,6 +60,7 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); + llama_kv_cache * kv = llama_get_kv_cache(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -95,7 +96,7 @@ int main(int argc, char ** argv) { llama_decode(ctx, llama_batch_get_one(&inp.back(), 1)); for (int s = 1; s < W + G + 1; ++s) { - llama_kv_cache_seq_cp(ctx, 0, s, -1, -1); + llama_kv_cache_seq_cp(kv, 0, s, -1, -1); } const auto t_enc_end = ggml_time_us(); @@ -437,17 +438,17 @@ int main(int argc, char ** argv) { // KV cache management // if no verification token matched, we simply remove all cells from this batch -> no fragmentation - llama_kv_cache_seq_rm(ctx, -1, n_past, -1); + llama_kv_cache_seq_rm(kv, -1, n_past, -1); if (seq_id_best != 0) { // if a verification token matched, we keep the best sequence and remove the rest // this leads to some KV cache fragmentation - llama_kv_cache_seq_keep(ctx, seq_id_best); - llama_kv_cache_seq_cp (ctx, seq_id_best, 0, -1, -1); - llama_kv_cache_seq_rm (ctx, seq_id_best, -1, -1); + llama_kv_cache_seq_keep(kv, seq_id_best); + llama_kv_cache_seq_cp (kv, seq_id_best, 0, -1, -1); + llama_kv_cache_seq_rm (kv, seq_id_best, -1, -1); for (int s = 1; s < W + G + 1; ++s) { - llama_kv_cache_seq_cp(ctx, 0, s, -1, -1); + llama_kv_cache_seq_cp(kv, 0, s, -1, -1); } } } diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index dbd0444ec..8628f7318 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -35,6 +35,7 @@ int main(int argc, char ** argv){ llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); + llama_kv_cache * kv = llama_get_kv_cache(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -192,7 +193,7 @@ int main(int argc, char ** argv){ // KV cache management // clean the cache of draft tokens that weren't accepted - llama_kv_cache_seq_rm(ctx, 0, n_past, -1); + llama_kv_cache_seq_rm(kv, 0, n_past, -1); common_batch_clear(batch_tgt); common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index da2a03ab9..9d79af79e 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -157,6 +157,8 @@ int main(int argc, char ** argv) { return 1; } + llama_kv_cache * kv = llama_get_kv_cache(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); auto chat_templates = common_chat_templates_from_model(model, params.chat_template); @@ -328,7 +330,7 @@ int main(int argc, char ** argv) { } // remove any "future" tokens that we might have inherited from the previous session - llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1); + llama_kv_cache_seq_rm(kv, -1, n_matching_session_tokens, -1); } LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n", @@ -569,8 +571,8 @@ int main(int argc, char ** argv) { LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); - llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); + llama_kv_cache_seq_rm (kv, 0, params.n_keep , params.n_keep + n_discard); + llama_kv_cache_seq_add(kv, 0, params.n_keep + n_discard, n_past, -n_discard); n_past -= n_discard; @@ -593,9 
+595,9 @@ int main(int argc, char ** argv) { LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); - llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd); - llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); - llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); + llama_kv_cache_seq_add(kv, 0, ga_i, n_past, ib*bd); + llama_kv_cache_seq_div(kv, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); + llama_kv_cache_seq_add(kv, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); n_past -= bd; diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 7ef43d5e1..2ba0706dc 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -134,6 +134,7 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); + llama_kv_cache * kv = llama_get_kv_cache(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -201,7 +202,7 @@ int main(int argc, char ** argv) { // assign the system KV cache to all parallel sequences for (int32_t i = 1; i <= n_clients; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); + llama_kv_cache_seq_cp(kv, 0, i, -1, -1); } LOG_INF("\n"); @@ -233,9 +234,9 @@ int main(int argc, char ** argv) { if (batch.n_tokens == 0) { // all sequences have ended - clear the entire KV cache for (int i = 1; i <= n_clients; ++i) { - llama_kv_cache_seq_rm(ctx, i, -1, -1); + llama_kv_cache_seq_rm(kv, i, -1, -1); // but keep the system prompt - llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); + llama_kv_cache_seq_cp(kv, 0, i, -1, -1); } LOG_INF("%s: clearing the KV cache\n", __func__); @@ -371,8 +372,8 @@ int main(int argc, char ** argv) { } // delete only the generated part of the sequence, i.e. 
keep the system prompt in the cache - llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1); - llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1); + llama_kv_cache_seq_rm(kv, client.id + 1, -1, -1); + llama_kv_cache_seq_cp(kv, 0, client.id + 1, -1, -1); const auto t_main_end = ggml_time_us(); diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 5953928d4..e2764313b 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -86,6 +86,8 @@ int main(int argc, char ** argv) { return 1; } + llama_kv_cache * kv = llama_get_kv_cache(ctx); + auto sparams = llama_sampler_chain_default_params(); llama_sampler * smpl = llama_sampler_chain_init(sparams); @@ -132,11 +134,11 @@ int main(int argc, char ** argv) { const int ib = i/n_batch - 1; const int bd = n_batch_grp*(n_grp - 1); - llama_kv_cache_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd); - llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); - llama_kv_cache_update (ctx); + llama_kv_cache_seq_add(kv, 0, n_past - n_batch, n_past, ib*bd); + llama_kv_cache_seq_div(kv, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); + llama_update_kv_cache (ctx, kv); - n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; + n_past = llama_kv_cache_seq_pos_max(kv, 0) + 1; } common_batch_clear(batch); @@ -166,12 +168,12 @@ int main(int argc, char ** argv) { LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard); - llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); - //llama_kv_cache_defrag (ctx); - llama_kv_cache_update (ctx); + llama_kv_cache_seq_rm (kv, 0, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(kv, 0, n_keep + n_discard, n_ctx, -n_discard); + //llama_kv_cache_defrag (kv); + llama_update_kv_cache (ctx, kv); - n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; + n_past = llama_kv_cache_seq_pos_max(kv, 0) + 1; common_batch_clear(batch); @@ -197,12 +199,12 @@ int main(int argc, char ** argv) { if (n_discard > 0) { LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); - llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); - //llama_kv_cache_defrag (ctx); - llama_kv_cache_update (ctx); + llama_kv_cache_seq_rm (kv, 0, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(kv, 0, n_keep + n_discard, n_ctx, -n_discard); + //llama_kv_cache_defrag (kv); + llama_update_kv_cache (ctx, kv); - n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; + n_past = llama_kv_cache_seq_pos_max(kv, 0) + 1; } } diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 9bf6c5743..6c9f716ed 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -299,6 +299,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + const bool add_bos = llama_vocab_get_add_bos(vocab); GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); @@ -360,7 +362,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); llama_batch batch = llama_batch_init(n_batch, 0, 1); @@ -450,6 +452,8 @@ static results_perplexity 
perplexity(llama_context * ctx, const common_params & const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + const bool add_bos = llama_vocab_get_add_bos(vocab); GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); @@ -546,7 +550,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; @@ -741,6 +745,8 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + // Calculates hellaswag score (acc_norm) from prompt // // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl @@ -923,7 +929,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { return; } - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1084,6 +1090,8 @@ static void winogrande_score(llama_context * ctx, const common_params & params) const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + constexpr int k_min_trailing_ctx = 3; auto data = load_winogrande_from_csv(params.prompt); @@ -1202,7 +1210,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params) return; } - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1388,6 +1396,8 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + std::istringstream strstream(params.prompt); uint32_t n_task; strstream.read((char *)&n_task, sizeof(n_task)); @@ -1574,7 +1584,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par return; } - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1671,6 +1681,8 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); + llama_kv_cache * kv = llama_get_kv_cache(ctx); + if (params.logits_file.empty()) { LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__); return; @@ -1764,7 +1776,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { } // clear the KV cache - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); llama_batch batch = llama_batch_init(n_batch, 0, 1); diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 2439022a2..a907ea076 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -82,8 +82,10 @@ static void batch_add_seq(llama_batch & batch, const std::vector & toke } 
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) { + llama_kv_cache * kv = llama_get_kv_cache(ctx); + // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 92a49eb74..8e2c174a9 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -756,7 +756,8 @@ static int apply_chat_template(const common_chat_template & tmpl, LlamaData & ll // Function to tokenize the prompt static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt, std::vector & prompt_tokens, const LlamaData & llama_data) { - const bool is_first = llama_get_kv_cache_used_cells(llama_data.context.get()) == 0; + const llama_kv_cache * kv = llama_get_kv_cache(llama_data.context.get()); + const bool is_first = llama_kv_cache_used_cells(kv) == 0; const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); prompt_tokens.resize(n_prompt_tokens); @@ -771,8 +772,10 @@ static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt // Check if we have enough space in the context to evaluate this batch static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) { + llama_kv_cache * kv = llama_get_kv_cache(ctx.get()); + const int n_ctx = llama_n_ctx(ctx.get()); - const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get()); + const int n_ctx_used = llama_kv_cache_used_cells(kv); if (n_ctx_used + batch.n_tokens > n_ctx) { printf("\033[0m\n"); printe("context size exceeded\n"); diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index cf7cbd815..3839fbe8c 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -156,6 +156,8 @@ int main(int argc, char ** argv) { // make new context llama_context * ctx3 = llama_init_from_model(model, common_context_params_to_llama(params)); + llama_kv_cache * kv3 = llama_get_kv_cache(ctx3); + llama_sampler * smpl3 = llama_sampler_chain_init(sparams); llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sampling.seed)); @@ -196,7 +198,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy); // erase whole kv - llama_kv_cache_clear(ctx3); + llama_kv_cache_clear(kv3); fprintf(stderr, "%s : kv cache cleared\n", __func__); // restore kv into seq 1 diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b1cde2d7f..076044d39 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1693,6 +1693,7 @@ struct server_context { llama_model * model = nullptr; llama_context * ctx = nullptr; + llama_kv_cache * kv = nullptr; const llama_vocab * vocab = nullptr; @@ -1755,6 +1756,8 @@ struct server_context { return false; } + kv = llama_get_kv_cache(ctx); + vocab = llama_model_get_vocab(model); n_ctx = llama_n_ctx(ctx); @@ -2023,7 +2026,7 @@ struct server_context { SRV_DBG("%s", "clearing KV cache\n"); // clear the entire KV cache - llama_kv_cache_clear(ctx); + llama_kv_cache_clear(kv); clean_kv_cache = false; } @@ -2565,8 +2568,8 @@ struct server_context { res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); res->t_start = metrics.t_start; - res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx); - 
res->kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx); + res->kv_cache_tokens_count = llama_kv_cache_n_tokens(kv); + res->kv_cache_used_cells = llama_kv_cache_used_cells(kv); res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; res->t_prompt_processing_total = metrics.t_prompt_processing_total; @@ -2682,7 +2685,7 @@ struct server_context { // Erase token cache const size_t n_erased = slot->cache_tokens.size(); - llama_kv_cache_seq_rm(ctx, slot->id, -1, -1); + llama_kv_cache_seq_rm(kv, slot->id, -1, -1); slot->cache_tokens.clear(); auto res = std::make_unique(); @@ -2750,8 +2753,8 @@ struct server_context { SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); - llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard); + llama_kv_cache_seq_rm (kv, slot.id, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(kv, slot.id, n_keep + n_discard, slot.n_past, -n_discard); if (slot.params.cache_prompt) { for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { @@ -2938,8 +2941,8 @@ struct server_context { const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; - llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c); - llama_kv_cache_seq_add(ctx, slot.id, head_c, -1, kv_shift); + llama_kv_cache_seq_rm (kv, slot.id, head_p, head_c); + llama_kv_cache_seq_add(kv, slot.id, head_c, -1, kv_shift); for (size_t i = 0; i < n_match; i++) { slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i]; @@ -2977,9 +2980,9 @@ struct server_context { } // keep only the common part - if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) { + if (!llama_kv_cache_seq_rm(kv, slot.id, slot.n_past, -1)) { // could not partially delete (likely using a non-Transformer model) - llama_kv_cache_seq_rm(ctx, slot.id, -1, -1); + llama_kv_cache_seq_rm(kv, slot.id, -1, -1); // there is no common part left slot.n_past = 0; @@ -3219,7 +3222,7 @@ struct server_context { slot.cache_tokens.push_back(id); slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1); - llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1); + llama_kv_cache_seq_rm(kv, slot.id, slot.n_past, -1); for (size_t i = 0; i < ids.size(); ++i) { completion_token_output result; diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index c5534cc13..130e326b5 100644 --- a/examples/simple-chat/simple-chat.cpp +++ b/examples/simple-chat/simple-chat.cpp @@ -88,6 +88,8 @@ int main(int argc, char ** argv) { return 1; } + const llama_kv_cache * kv = llama_get_kv_cache(ctx); + // initialize the sampler llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params()); llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1)); @@ -98,7 +100,7 @@ int main(int argc, char ** argv) { auto generate = [&](const std::string & prompt) { std::string response; - const bool is_first = llama_get_kv_cache_used_cells(ctx) == 0; + const bool is_first = llama_kv_cache_used_cells(kv) == 0; // tokenize the prompt const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); @@ -113,7 +115,7 @@ int main(int argc, char ** argv) { while (true) { // check if we have enough space in the context to evaluate this batch int n_ctx = llama_n_ctx(ctx); - int n_ctx_used = llama_get_kv_cache_used_cells(ctx); + int n_ctx_used = llama_kv_cache_used_cells(kv); if (n_ctx_used + 
batch.n_tokens > n_ctx) { printf("\033[0m\n"); fprintf(stderr, "context size exceeded\n"); diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index 403ba2dd2..24bdc806d 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -45,6 +45,8 @@ int main(int argc, char ** argv) { model_tgt = llama_init_tgt.model.get(); ctx_tgt = llama_init_tgt.context.get(); + llama_kv_cache * kv = llama_get_kv_cache(ctx_tgt); + const llama_vocab * vocab = llama_model_get_vocab(model_tgt); // load the draft model @@ -217,7 +219,7 @@ int main(int argc, char ** argv) { { LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past); - llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1); + llama_kv_cache_seq_rm(kv, 0, n_past, -1); } if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) { diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index c7ccea50d..b4e5259b5 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -90,6 +90,9 @@ int main(int argc, char ** argv) { model_dft = llama_init_dft.model.get(); ctx_dft = llama_init_dft.context.get(); + llama_kv_cache * kv_tgt = llama_get_kv_cache(ctx_tgt); + llama_kv_cache * kv_dft = llama_get_kv_cache(ctx_dft); + const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt); const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft); @@ -420,14 +423,14 @@ int main(int argc, char ** argv) { { LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); - llama_kv_cache_seq_keep(ctx_dft, s_keep); - llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1); - llama_kv_cache_seq_keep(ctx_dft, 0); + llama_kv_cache_seq_keep(kv_dft, s_keep); + llama_kv_cache_seq_cp (kv_dft, s_keep, 0, -1, -1); + llama_kv_cache_seq_keep(kv_dft, 0); - llama_kv_cache_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1); - llama_kv_cache_seq_keep(ctx_tgt, s_keep); - llama_kv_cache_seq_cp (ctx_tgt, s_keep, 0, -1, -1); - llama_kv_cache_seq_keep(ctx_tgt, 0); + llama_kv_cache_seq_rm (kv_tgt, s_keep, n_past_tgt, -1); + llama_kv_cache_seq_keep(kv_tgt, s_keep); + llama_kv_cache_seq_cp (kv_tgt, s_keep, 0, -1, -1); + llama_kv_cache_seq_keep(kv_tgt, 0); } for (int s = 0; s < n_seq_dft; ++s) { @@ -444,8 +447,8 @@ int main(int argc, char ** argv) { common_batch_clear(batch_dft); common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true); - llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); - // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); + llama_kv_cache_seq_rm(kv_dft, 0, n_past_dft, -1); + // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(kv_dft, batch_dft).c_str()); llama_decode(ctx_dft, batch_dft); ++n_past_dft; @@ -503,8 +506,8 @@ int main(int argc, char ** argv) { if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) { LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur); - llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1); - llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1); + llama_kv_cache_seq_rm(kv_dft, n_seq_cur, -1, -1); + llama_kv_cache_seq_cp(kv_dft, s, n_seq_cur, -1, -1); // all previous tokens from this branch are now also part of the new branch for (int t = 0; t < batch_tgt.n_tokens; ++t) { @@ -585,9 +588,9 @@ int main(int argc, char ** argv) { // evaluate the target model on the drafted tokens { - llama_kv_cache_seq_keep(ctx_tgt, 0); + llama_kv_cache_seq_keep(kv_tgt, 0); for 
(int s = 1; s < n_seq_dft; ++s) { - llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1); + llama_kv_cache_seq_cp(kv_tgt, 0, s, -1, -1); } // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); From 4d7bd03e653f24e00158ae7e819908e444a20353 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 13 Jan 2025 15:50:39 +0200 Subject: [PATCH 03/28] kv_cache : functions -> members ggml-ci --- src/llama-context.cpp | 2 +- src/llama-kv-cache.cpp | 490 ++++++++++++++++++++++++++++++++++------- src/llama-kv-cache.h | 404 +++++---------------------------- src/llama.cpp | 16 +- 4 files changed, 467 insertions(+), 445 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index bf5a77cca..0654feccb 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1169,7 +1169,7 @@ struct llama_data_read { } batch.n_seq_id[0] = 1; batch.seq_id[0] = &dest_seq_id; - if (!llama_kv_cache_find_slot(kv_self, batch)) { + if (!kv_self.find_slot(batch)) { LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); return false; } diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index b0d5a9318..8b2f6287b 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -11,41 +11,35 @@ static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false}; -uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) { - // the FA kernels require padding to avoid extra runtime boundary checks - return cparams.flash_attn ? 256u : 32u; -} - -bool llama_kv_cache_init( - struct llama_kv_cache & cache, - const llama_model & model, - const llama_cparams & cparams, - ggml_type type_k, - ggml_type type_v, - uint32_t kv_size, - bool offload) { +bool llama_kv_cache::init( + const llama_model & model, + const llama_cparams & cparams, + ggml_type type_k, + ggml_type type_v, + uint32_t kv_size, + bool offload) { const struct llama_hparams & hparams = model.hparams; const int32_t n_layer = hparams.n_layer; - cache.has_shift = false; + has_shift = false; - cache.recurrent = llama_model_is_recurrent(&model); - cache.v_trans = !cache.recurrent && !cparams.flash_attn; - cache.can_shift = !cache.recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA + recurrent = llama_model_is_recurrent(&model); + v_trans = !recurrent && !cparams.flash_attn; + can_shift = !recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n", - __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, cache.can_shift); + __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift); - cache.head = 0; - cache.size = kv_size; - cache.used = 0; + head = 0; + size = kv_size; + used = 0; - cache.type_k = type_k; - cache.type_v = type_v; + type_k = type_k; + type_v = type_v; - cache.cells.clear(); - cache.cells.resize(kv_size); + cells.clear(); + cells.resize(kv_size); // create a context for each buffer type std::map ctx_map; @@ -57,19 +51,23 @@ bool llama_kv_cache_init( /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; + ggml_context * ctx = ggml_init(params); if (!ctx) { return nullptr; } + ctx_map[buft] = ctx; - cache.ctxs.emplace_back(ctx); + ctxs.emplace_back(ctx); + return ctx; } + return it->second; }; - cache.k_l.reserve(n_layer); - cache.v_l.reserve(n_layer); + k_l.reserve(n_layer); + v_l.reserve(n_layer); for (int i = 0; i < n_layer; i++) { const uint32_t 
n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(); @@ -95,8 +93,8 @@ bool llama_kv_cache_init( ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); - cache.k_l.push_back(k); - cache.v_l.push_back(v); + k_l.push_back(k); + v_l.push_back(v); } // allocate tensors and initialize the buffers to avoid NaNs in the padding @@ -111,20 +109,339 @@ bool llama_kv_cache_init( } ggml_backend_buffer_clear(buf, 0); LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); - cache.bufs.emplace_back(buf); + bufs.emplace_back(buf); } return true; } -struct llama_kv_cache_slot_info llama_kv_cache_find_slot( - struct llama_kv_cache & cache, +int32_t llama_kv_cache::n_tokens() const { + int32_t result = 0; + + for (uint32_t i = 0; i < size; i++) { + result += cells[i].seq_id.size(); + } + + return result; +} + +size_t llama_kv_cache::total_size() const { + size_t size = 0; + for (const auto & buf : bufs) { + size += ggml_backend_buffer_get_size(buf.get()); + } + + return size; +} + +// TODO: better data structures to reduce the cost of this operation +llama_pos llama_kv_cache::max_pos() const { + llama_pos max_pos = -1; + for (const auto & cell : cells) { + max_pos = std::max(max_pos, cell.pos); + } + + return max_pos; +} + +void llama_kv_cache::clear() { + for (int32_t i = 0; i < (int32_t) size; ++i) { + cells[i].pos = -1; + cells[i].seq_id.clear(); + cells[i].src = -1; + cells[i].tail = -1; + } + head = 0; + used = 0; + + for (auto & buf : bufs) { + ggml_backend_buffer_clear(buf.get(), 0); + } +} + +bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + uint32_t new_head = size; + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // models like Mamba or RWKV can't have a state partially erased + if (recurrent) { + if (seq_id >= (int64_t) size) { + // could be fatal + return false; + } + if (0 <= seq_id) { + int32_t & tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + const llama_kv_cell & cell = cells[tail_id]; + // partial intersection is invalid + if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) { + return false; + } + // invalidate tails which will be cleared + if (p0 <= cell.pos && cell.pos < p1) { + tail_id = -1; + } + } + } else { + // seq_id is negative, then the range should include everything or nothing + if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits::max())) { + return false; + } + } + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].pos >= p0 && cells[i].pos < p1) { + if (seq_id < 0) { + cells[i].seq_id.clear(); + } else if (cells[i].has_seq_id(seq_id)) { + cells[i].seq_id.erase(seq_id); + } else { + continue; + } + if (cells[i].is_empty()) { + // keep count of the number of used cells + if (cells[i].pos >= 0) { + used--; + } + + cells[i].pos = -1; + cells[i].src = -1; + + if (new_head == size) { + new_head = i; + } + } + } + } + + // If we freed up a slot, set head to it so searching can start there. 
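        // (note: head only ever moves backwards here - advancing it past its current
        //  position could skip over empty cells that lie between the current head and
        //  the freed slot)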
+ if (new_head != size && new_head < head) { + head = new_head; + } + + return true; +} + +void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + if (seq_id_src == seq_id_dst) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + if (recurrent) { + if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) { + llama_kv_cell & tail_src = cells[seq_id_src]; + llama_kv_cell & tail_dst = cells[seq_id_dst]; + if (tail_dst.tail >= 0) { + // clear destination seq_id if it wasn't empty + llama_kv_cell & cell_dst = cells[tail_dst.tail]; + + cell_dst.seq_id.erase(seq_id_dst); + tail_dst.tail = -1; + if (cell_dst.seq_id.empty()) { + cell_dst.pos = -1; + cell_dst.delta = -1; + cell_dst.src = -1; + used -= 1; + } + } + if (tail_src.tail >= 0) { + llama_kv_cell & cell_src = cells[tail_src.tail]; + + cell_src.seq_id.insert(seq_id_dst); + tail_dst.tail = tail_src.tail; + } + } + + return; + } + + // otherwise, this is the KV of a Transformer-like model + head = 0; + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id_src) && cells[i].pos >= p0 && cells[i].pos < p1) { + cells[i].seq_id.insert(seq_id_dst); + } + } +} + +void llama_kv_cache::seq_keep(llama_seq_id seq_id) { + uint32_t new_head = size; + + for (uint32_t i = 0; i < size; ++i) { + if (recurrent && (llama_seq_id) i != seq_id) { + cells[i].tail = -1; + } + + if (!cells[i].has_seq_id(seq_id)) { + if (cells[i].pos >= 0) { + used--; + } + + cells[i].pos = -1; + cells[i].src = -1; + cells[i].seq_id.clear(); + + if (new_head == size){ + new_head = i; + } + } else { + cells[i].seq_id.clear(); + cells[i].seq_id.insert(seq_id); + } + } + + // If we freed up a slot, set head to it so searching can start there. + if (new_head != size && new_head < head) { + head = new_head; + } +} + +void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { + if (delta == 0) { + return; + } + + uint32_t new_head = size; + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // If there is no range then return early to avoid looping over the + if (p0 == p1) { + return; + } + + if (recurrent) { + // for Mamba-like or RWKV models, only the pos needs to be shifted + if (0 <= seq_id && seq_id < (int64_t) size) { + const int32_t tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + llama_kv_cell & cell = cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos += delta; + } + } + } + return; + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { + has_shift = true; + cells[i].pos += delta; + cells[i].delta += delta; + + if (cells[i].pos < 0) { + if (!cells[i].is_empty()) { + used--; + } + cells[i].pos = -1; + cells[i].seq_id.clear(); + if (new_head == size) { + new_head = i; + } + } + } + } + + // If we freed up a slot, set head to it so searching can start there. + // Otherwise we just start the next search from the beginning. + head = new_head != size ? new_head : 0; +} + +void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { + if (d == 1) { + return; + } + + if (p0 < 0) { + p0 = 0; + } + + if (p1 < 0) { + p1 = std::numeric_limits::max(); + } + + // If there is no range then return early to avoid looping over the cache. 
+ if (p0 == p1) { + return; + } + + if (recurrent) { + // for Mamba-like or RWKV models, only the pos needs to be changed + if (0 <= seq_id && seq_id < (int64_t) size) { + const int32_t tail_id = cells[seq_id].tail; + if (tail_id >= 0) { + llama_kv_cell & cell = cells[tail_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos /= d; + } + } + } + + return; + } + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { + has_shift = true; + + { + llama_pos p_old = cells[i].pos; + cells[i].pos /= d; + cells[i].delta += cells[i].pos - p_old; + } + } + } +} + +llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) { + llama_pos result = 0; + + for (uint32_t i = 0; i < size; ++i) { + if (cells[i].has_seq_id(seq_id)) { + result = std::max(result, cells[i].pos); + } + } + + return result; +} + +void llama_kv_cache::defrag() { + if (!recurrent) { + do_defrag = true; + } +} + +struct llama_kv_cache_slot_info llama_kv_cache::find_slot( const struct llama_ubatch & ubatch) { const uint32_t n_tokens = ubatch.n_tokens; const uint32_t n_seqs = ubatch.n_seqs; const uint32_t n_seq_tokens = ubatch.n_seq_tokens; - if (cache.recurrent) { + if (recurrent) { // For recurrent state architectures (like Mamba or RWKV), // each cache cell can store the state for a whole sequence. // A slot should be always be contiguous. @@ -132,7 +449,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // can only process batches with an equal number of new tokens in each sequence GGML_ASSERT(ubatch.equal_seqs); - int32_t min = cache.size - 1; + int32_t min = size - 1; int32_t max = 0; // everything should fit if all seq_ids are smaller than the max @@ -141,16 +458,16 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( for (uint32_t j = 0; j < n_seq_id; ++j) { const llama_seq_id seq_id = ubatch.seq_id[s][j]; - if (seq_id < 0 || (uint32_t) seq_id >= cache.size) { + if (seq_id < 0 || (uint32_t) seq_id >= size) { // too big seq_id // TODO: would it be possible to resize the cache instead? 
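                    // (for recurrent caches the seq_id doubles as the index of its state cell,
                    //  so a seq_id >= size simply cannot be stored and the slot search must fail)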
- LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.size); + LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size); return llama_kv_cache_slot_info_failed; } if (j > 0) { - llama_kv_cell & seq = cache.cells[seq_id]; + llama_kv_cell & seq = cells[seq_id]; if (seq.tail >= 0) { - llama_kv_cell & cell = cache.cells[seq.tail]; + llama_kv_cell & cell = cells[seq.tail]; // clear cells from seq_ids that become shared // (should not normally happen, but let's handle it anyway) cell.seq_id.erase(seq_id); @@ -158,7 +475,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( if (cell.seq_id.empty()) { cell.pos = -1; cell.src = -1; - cache.used -= 1; + used -= 1; } } } @@ -168,9 +485,9 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( #ifndef NDEBUG { std::vector tails_verif; - tails_verif.assign(cache.size, -1); - for (uint32_t i = 0; i < cache.size; ++i) { - llama_kv_cell & cell = cache.cells[i]; + tails_verif.assign(size, -1); + for (uint32_t i = 0; i < size; ++i) { + llama_kv_cell & cell = cells[i]; for (llama_seq_id seq_id : cell.seq_id) { if (tails_verif[seq_id] != -1) { LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]); @@ -178,20 +495,20 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( tails_verif[seq_id] = i; } } - for (uint32_t i = 0; i < cache.size; ++i) { - if (tails_verif[i] != cache.cells[i].tail) { - LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cache.cells[i].tail, tails_verif[i]); + for (uint32_t i = 0; i < size; ++i) { + if (tails_verif[i] != cells[i].tail) { + LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cells[i].tail, tails_verif[i]); } } } #endif // find next empty cell - uint32_t next_empty_cell = cache.head; + uint32_t next_empty_cell = head; - for (uint32_t i = 0; i < cache.size; ++i) { - if (next_empty_cell >= cache.size) { next_empty_cell -= cache.size; } - llama_kv_cell & cell = cache.cells[next_empty_cell]; + for (uint32_t i = 0; i < size; ++i) { + if (next_empty_cell >= size) { next_empty_cell -= size; } + llama_kv_cell & cell = cells[next_empty_cell]; if (cell.is_empty()) { break; } next_empty_cell += 1; } @@ -199,20 +516,20 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // find usable cell range for (uint32_t s = 0; s < n_seqs; ++s) { const llama_seq_id seq_id = ubatch.seq_id[s][0]; - llama_kv_cell & seq_meta = cache.cells[seq_id]; + llama_kv_cell & seq_meta = cells[seq_id]; bool has_cell = false; if (seq_meta.tail >= 0) { - llama_kv_cell & cell = cache.cells[seq_meta.tail]; + llama_kv_cell & cell = cells[seq_meta.tail]; GGML_ASSERT(cell.has_seq_id(seq_id)); // does this seq_id "own" the cell? 
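                // (a cell is "owned" when this is the only sequence using it; a shared cell is
                //  not reused in place - its state is first copied into an empty cell below)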
if (cell.seq_id.size() == 1) { has_cell = true; } } if (!has_cell) { - llama_kv_cell & empty_cell = cache.cells[next_empty_cell]; + llama_kv_cell & empty_cell = cells[next_empty_cell]; GGML_ASSERT(empty_cell.is_empty()); // copy old tail into the empty cell if (seq_meta.tail >= 0) { - llama_kv_cell & orig_cell = cache.cells[seq_meta.tail]; + llama_kv_cell & orig_cell = cells[seq_meta.tail]; empty_cell.pos = orig_cell.pos; empty_cell.src = orig_cell.src; orig_cell.seq_id.erase(seq_id); @@ -222,9 +539,9 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // find next empty cell if (s + 1 < n_seqs) { next_empty_cell += 1; - for (uint32_t i = 0; i < cache.size; ++i) { - if (next_empty_cell >= cache.size) { next_empty_cell -= cache.size; } - llama_kv_cell & cell = cache.cells[next_empty_cell]; + for (uint32_t i = 0; i < size; ++i) { + if (next_empty_cell >= size) { next_empty_cell -= size; } + llama_kv_cell & cell = cells[next_empty_cell]; if (cell.is_empty()) { break; } next_empty_cell += 1; } @@ -237,10 +554,10 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // gather and re-order for (uint32_t s = 0; s < n_seqs; ++s) { int32_t dst_id = s + min; - int32_t src_id = cache.cells[ubatch.seq_id[s][0]].tail; + int32_t src_id = cells[ubatch.seq_id[s][0]].tail; if (dst_id != src_id) { - llama_kv_cell & dst_cell = cache.cells[dst_id]; - llama_kv_cell & src_cell = cache.cells[src_id]; + llama_kv_cell & dst_cell = cells[dst_id]; + llama_kv_cell & src_cell = cells[src_id]; std::swap(dst_cell.pos, src_cell.pos); std::swap(dst_cell.src, src_cell.src); @@ -248,10 +565,10 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( // swap tails (assuming they NEVER overlap) for (const llama_seq_id seq_id : src_cell.seq_id) { - cache.cells[seq_id].tail = src_id; + cells[seq_id].tail = src_id; } for (const llama_seq_id seq_id : dst_cell.seq_id) { - cache.cells[seq_id].tail = dst_id; + cells[seq_id].tail = dst_id; } } } @@ -260,7 +577,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( for (uint32_t s = 0; s < n_seqs; ++s) { const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1]; int32_t cell_id = s + min; - llama_kv_cell & cell = cache.cells[cell_id]; + llama_kv_cell & cell = cells[cell_id]; if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) { // What should happen when the pos backtracks or skips a value? @@ -273,41 +590,41 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) { const llama_seq_id seq_id = ubatch.seq_id[s][j]; cell.seq_id.insert(seq_id); - cache.cells[seq_id].tail = cell_id; + cells[seq_id].tail = cell_id; } } // allow getting the range of used cells, from head to head + n - cache.head = min; - cache.n = max - min + 1; - cache.used = std::count_if(cache.cells.begin(), cache.cells.end(), + head = min; + n = max - min + 1; + used = std::count_if(cells.begin(), cells.end(), [](const llama_kv_cell& cell){ return !cell.is_empty(); }); // sanity check - return llama_kv_cache_slot_info(cache.n >= n_seqs); + return llama_kv_cache_slot_info(n >= n_seqs); } // otherwise, one cell per token. 
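    // (unlike the recurrent case above, the loop below searches for a contiguous run of
    //  n_tokens empty cells, scanning from `head`, wrapping around once past the end and
    //  giving up after every cell has been tested)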
- if (n_tokens > cache.size) { - LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size); + if (n_tokens > size) { + LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %d\n", __func__, n_tokens, size); return llama_kv_cache_slot_info_failed; } uint32_t n_tested = 0; while (true) { - if (cache.head + n_tokens > cache.size) { - n_tested += cache.size - cache.head; - cache.head = 0; + if (head + n_tokens > size) { + n_tested += size - head; + head = 0; continue; } bool found = true; for (uint32_t i = 0; i < n_tokens; i++) { - if (cache.cells[cache.head + i].pos >= 0) { + if (cells[head + i].pos >= 0) { found = false; - cache.head += i + 1; - n_tested += i + 1; + head += i + 1; + n_tested += i + 1; break; } } @@ -316,7 +633,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( break; } - if (n_tested >= cache.size) { + if (n_tested >= size) { //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); return llama_kv_cache_slot_info_failed; } @@ -325,22 +642,27 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot( for (uint32_t s = 0; s < n_seqs; s++) { for (uint32_t i = 0; i < n_seq_tokens; ++i) { uint32_t k = s*n_seq_tokens + i; - cache.cells[cache.head + k].pos = ubatch.pos[k]; + cells[head + k].pos = ubatch.pos[k]; for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) { - cache.cells[cache.head + k].seq_id.insert(ubatch.seq_id[s][j]); + cells[head + k].seq_id.insert(ubatch.seq_id[s][j]); } } } - cache.used += n_tokens; + used += n_tokens; - return llama_kv_cache_slot_info(cache.head, cache.head + n_tokens); + return llama_kv_cache_slot_info(head, head + n_tokens); } -uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) { - for (uint32_t i = cache.size; i > 0; --i) { - const llama_kv_cell & cell = cache.cells[i - 1]; +uint32_t llama_kv_cache::get_padding(const llama_cparams & cparams) const { + // the FA kernels require padding to avoid extra runtime boundary checks + return cparams.flash_attn ? 
256u : 32u; +} + +uint32_t llama_kv_cache::cell_max() const { + for (uint32_t i = size; i > 0; --i) { + const llama_kv_cell & cell = cells[i - 1]; if (cell.pos >= 0 && !cell.is_empty()) { return i; diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index b0bb1cfb1..4ee3418d8 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -7,6 +7,9 @@ #include #include +struct llama_cparams; +struct llama_ubatch; + struct llama_kv_cell { llama_pos pos = -1; llama_pos delta = 0; @@ -28,7 +31,19 @@ struct llama_kv_cell { } }; +// a structure holds information about the slot found in llama_kv_cache_find_slot +struct llama_kv_cache_slot_info { + std::pair boundaries; // slot boundaries [begin, end) + bool found = false; // the slot was found + + explicit llama_kv_cache_slot_info(bool found_) : found{found_} {} + llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {} + + operator bool() const { return found; } +}; + // ring-buffer of cached KV data +// TODO: pimpl struct llama_kv_cache { bool has_shift = false; bool do_defrag = false; @@ -57,343 +72,8 @@ struct llama_kv_cache { std::vector ctxs; std::vector bufs; - int32_t n_tokens() const { - int32_t result = 0; - - for (uint32_t i = 0; i < size; i++) { - result += cells[i].seq_id.size(); - } - - return result; - } - - size_t total_size() const { - size_t size = 0; - for (const auto & buf : bufs) { - size += ggml_backend_buffer_get_size(buf.get()); - } - - return size; - } - - // TODO: better data structures to reduce the cost of this operation - llama_pos max_pos() const { - llama_pos max_pos = -1; - for (const auto & cell : cells) { - max_pos = std::max(max_pos, cell.pos); - } - - return max_pos; - } - - void clear() { - for (int32_t i = 0; i < (int32_t) size; ++i) { - cells[i].pos = -1; - cells[i].seq_id.clear(); - cells[i].src = -1; - cells[i].tail = -1; - } - head = 0; - used = 0; - - for (auto & buf : bufs) { - ggml_backend_buffer_clear(buf.get(), 0); - } - } - - bool seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { - uint32_t new_head = size; - - if (p0 < 0) { - p0 = 0; - } - - if (p1 < 0) { - p1 = std::numeric_limits::max(); - } - - // models like Mamba or RWKV can't have a state partially erased - if (recurrent) { - if (seq_id >= (int64_t) size) { - // could be fatal - return false; - } - if (0 <= seq_id) { - int32_t & tail_id = cells[seq_id].tail; - if (tail_id >= 0) { - const llama_kv_cell & cell = cells[tail_id]; - // partial intersection is invalid - if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) { - return false; - } - // invalidate tails which will be cleared - if (p0 <= cell.pos && cell.pos < p1) { - tail_id = -1; - } - } - } else { - // seq_id is negative, then the range should include everything or nothing - if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits::max())) { - return false; - } - } - } - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].pos >= p0 && cells[i].pos < p1) { - if (seq_id < 0) { - cells[i].seq_id.clear(); - } else if (cells[i].has_seq_id(seq_id)) { - cells[i].seq_id.erase(seq_id); - } else { - continue; - } - if (cells[i].is_empty()) { - // keep count of the number of used cells - if (cells[i].pos >= 0) { - used--; - } - - cells[i].pos = -1; - cells[i].src = -1; - - if (new_head == size) { - new_head = i; - } - } - } - } - - // If we freed up a slot, set head to it so searching can start there. 
- if (new_head != size && new_head < head) { - head = new_head; - } - - return true; - } - - void seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { - if (seq_id_src == seq_id_dst) { - return; - } - - if (p0 < 0) { - p0 = 0; - } - - if (p1 < 0) { - p1 = std::numeric_limits::max(); - } - - if (recurrent) { - if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) { - llama_kv_cell & tail_src = cells[seq_id_src]; - llama_kv_cell & tail_dst = cells[seq_id_dst]; - if (tail_dst.tail >= 0) { - // clear destination seq_id if it wasn't empty - llama_kv_cell & cell_dst = cells[tail_dst.tail]; - - cell_dst.seq_id.erase(seq_id_dst); - tail_dst.tail = -1; - if (cell_dst.seq_id.empty()) { - cell_dst.pos = -1; - cell_dst.delta = -1; - cell_dst.src = -1; - used -= 1; - } - } - if (tail_src.tail >= 0) { - llama_kv_cell & cell_src = cells[tail_src.tail]; - - cell_src.seq_id.insert(seq_id_dst); - tail_dst.tail = tail_src.tail; - } - } - - return; - } - - // otherwise, this is the KV of a Transformer-like model - head = 0; - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id_src) && cells[i].pos >= p0 && cells[i].pos < p1) { - cells[i].seq_id.insert(seq_id_dst); - } - } - } - - void seq_keep(llama_seq_id seq_id) { - uint32_t new_head = size; - - for (uint32_t i = 0; i < size; ++i) { - if (recurrent && (llama_seq_id) i != seq_id) { - cells[i].tail = -1; - } - - if (!cells[i].has_seq_id(seq_id)) { - if (cells[i].pos >= 0) { - used--; - } - - cells[i].pos = -1; - cells[i].src = -1; - cells[i].seq_id.clear(); - - if (new_head == size){ - new_head = i; - } - } else { - cells[i].seq_id.clear(); - cells[i].seq_id.insert(seq_id); - } - } - - // If we freed up a slot, set head to it so searching can start there. - if (new_head != size && new_head < head) { - head = new_head; - } - } - - void seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { - if (delta == 0) { - return; - } - - uint32_t new_head = size; - - if (p0 < 0) { - p0 = 0; - } - - if (p1 < 0) { - p1 = std::numeric_limits::max(); - } - - // If there is no range then return early to avoid looping over the - if (p0 == p1) { - return; - } - - if (recurrent) { - // for Mamba-like or RWKV models, only the pos needs to be shifted - if (0 <= seq_id && seq_id < (int64_t) size) { - const int32_t tail_id = cells[seq_id].tail; - if (tail_id >= 0) { - llama_kv_cell & cell = cells[tail_id]; - if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { - cell.pos += delta; - } - } - } - return; - } - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { - has_shift = true; - cells[i].pos += delta; - cells[i].delta += delta; - - if (cells[i].pos < 0) { - if (!cells[i].is_empty()) { - used--; - } - cells[i].pos = -1; - cells[i].seq_id.clear(); - if (new_head == size) { - new_head = i; - } - } - } - } - - // If we freed up a slot, set head to it so searching can start there. - // Otherwise we just start the next search from the beginning. - head = new_head != size ? new_head : 0; - } - - void seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { - if (d == 1) { - return; - } - - if (p0 < 0) { - p0 = 0; - } - - if (p1 < 0) { - p1 = std::numeric_limits::max(); - } - - // If there is no range then return early to avoid looping over the cache. 
- if (p0 == p1) { - return; - } - - if (recurrent) { - // for Mamba-like or RWKV models, only the pos needs to be changed - if (0 <= seq_id && seq_id < (int64_t) size) { - const int32_t tail_id = cells[seq_id].tail; - if (tail_id >= 0) { - llama_kv_cell & cell = cells[tail_id]; - if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { - cell.pos /= d; - } - } - } - - return; - } - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) { - has_shift = true; - - { - llama_pos p_old = cells[i].pos; - cells[i].pos /= d; - cells[i].delta += cells[i].pos - p_old; - } - } - } - } - - llama_pos seq_pos_max(llama_seq_id seq_id) { - llama_pos result = 0; - - for (uint32_t i = 0; i < size; ++i) { - if (cells[i].has_seq_id(seq_id)) { - result = std::max(result, cells[i].pos); - } - } - - return result; - } - - void defrag() { - if (!recurrent) { - do_defrag = true; - } - } -}; - -// a structure holds information about the slot found in llama_kv_cache_find_slot -struct llama_kv_cache_slot_info { - std::pair boundaries; // slot boundaries [begin, end) - bool found = false; // the slot was found - - explicit llama_kv_cache_slot_info(bool found_) : found{found_} {} - llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {} - - operator bool() const { return found; } -}; - -// TODO: maybe not needed -uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams); - -bool llama_kv_cache_init( - struct llama_kv_cache & cache, + // TODO: become constructor + bool init( const llama_model & model, const llama_cparams & cparams, ggml_type type_k, @@ -401,25 +81,38 @@ bool llama_kv_cache_init( uint32_t kv_size, bool offload); -// find an empty slot of size "n_tokens" in the cache -// updates the cache head -// returns a structure holding information about the slot found -// Note: On success, it's important that cache.head points -// to the first cell of the slot. -struct llama_kv_cache_slot_info llama_kv_cache_find_slot( - struct llama_kv_cache & cache, - const struct llama_ubatch & batch); + int32_t n_tokens() const; -// find how many cells are currently in use -uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache); + size_t total_size() const; -// -// kv cache view -// + // TODO: better data structures to reduce the cost of this operation + llama_pos max_pos() const; -struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max); + void clear(); -void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache & kv); + bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1); + void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1); + void seq_keep(llama_seq_id seq_id); + void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta); + void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d); + + llama_pos seq_pos_max(llama_seq_id seq_id); + + void defrag(); + + // find an empty slot of size "n_tokens" in the cache + // updates the cache head + // returns a structure holding information about the slot found + // Note: On success, it's important that cache.head points + // to the first cell of the slot. 
+ llama_kv_cache_slot_info find_slot(const llama_ubatch & batch); + + // TODO: maybe not needed + uint32_t get_padding(const llama_cparams & cparams) const; + + // find how many cells are currently in use + uint32_t cell_max() const; +}; // // kv cache restore @@ -472,3 +165,10 @@ struct llama_kv_slot_restorer { } }; +// +// kv cache view +// + +struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max); + +void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache & kv); diff --git a/src/llama.cpp b/src/llama.cpp index 87dd512b2..d8427af9d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8572,18 +8572,18 @@ static int llama_decode_impl( kv_self.head = 0; } - const auto slot = llama_kv_cache_find_slot(kv_self, ubatch); - if (!slot) { + const auto slot_info = kv_self.find_slot(ubatch); + if (!slot_info) { return 1; } - kv_slot_restorer.save(slot); + kv_slot_restorer.save(slot_info); if (!kv_self.recurrent) { // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important - const uint32_t pad = llama_kv_cache_get_padding(cparams); - kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad))); + const uint32_t pad = kv_self.get_padding(cparams); + kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(kv_self.cell_max(), pad))); //kv_self.n = llama_kv_cache_cell_max(kv_self); } } @@ -8969,7 +8969,7 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) { const uint32_t n_layer = hparams.n_layer; - const uint32_t n_kv = llama_kv_cache_cell_max(kv_self); + const uint32_t n_kv = kv_self.cell_max(); const uint32_t n_used = kv_self.used; assert(n_used <= n_kv); @@ -9550,7 +9550,7 @@ struct llama_context * llama_init_from_model( cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; // this is necessary due to kv_self.n being padded later during inference - cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams)); + cparams.n_ctx = GGML_PAD(cparams.n_ctx, ctx->kv_self.get_padding(cparams)); // with causal attention, the batch size is limited by the context size cparams.n_batch = hparams.causal_attn ? 
std::min(cparams.n_ctx, params.n_batch) : params.n_batch; @@ -9692,7 +9692,7 @@ struct llama_context * llama_init_from_model( llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data); - if (!llama_kv_cache_init(ctx->kv_self, ctx->model, ctx->cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { + if (!ctx->kv_self.init(ctx->model, ctx->cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; From fef90cb3d7a823bd00a7899b52ffc70a4f824d44 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 13 Jan 2025 15:58:20 +0200 Subject: [PATCH 04/28] kv_cache : fix ggml-ci --- src/llama-kv-cache.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 8b2f6287b..fe5986768 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -35,8 +35,8 @@ bool llama_kv_cache::init( size = kv_size; used = 0; - type_k = type_k; - type_v = type_v; + this->type_k = type_k; + this->type_v = type_v; cells.clear(); cells.resize(kv_size); From 73a14eccc9f200d6012963af9448042dfeac54fc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 14 Jan 2025 11:56:53 +0200 Subject: [PATCH 05/28] kv_cache : minor --- src/llama-kv-cache.cpp | 38 +++++++++++++++++++++++++++++++------- src/llama-kv-cache.h | 18 +++++++++++------- src/llama.cpp | 18 +++++------------- 3 files changed, 47 insertions(+), 27 deletions(-) diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index fe5986768..9f3b4e514 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -73,17 +73,22 @@ bool llama_kv_cache::init( const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s(); - LLAMA_LOG_DEBUG("%s: layer %d: n_embd_k_gqa = %d, n_embd_v_gqa = %d\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa); + const char * dev_name = "CPU"; ggml_backend_buffer_type_t buft; if (offload) { auto * dev = model.dev_layer(i); buft = ggml_backend_dev_buffer_type(dev); + + dev_name = ggml_backend_dev_name(dev); } else { buft = ggml_backend_cpu_buffer_type(); } - ggml_context * ctx = ctx_for_buft(buft); + LLAMA_LOG_DEBUG("%s: layer %3d: n_embd_k_gqa = %d, n_embd_v_gqa = %d, dev = %s\n", __func__, + i, n_embd_k_gqa, n_embd_v_gqa, dev_name); + + ggml_context * ctx = ctx_for_buft(buft); if (!ctx) { LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__); return false; @@ -134,14 +139,13 @@ size_t llama_kv_cache::total_size() const { return size; } -// TODO: better data structures to reduce the cost of this operation -llama_pos llama_kv_cache::max_pos() const { - llama_pos max_pos = -1; +llama_pos llama_kv_cache::pos_max() const { + llama_pos pos_max = -1; for (const auto & cell : cells) { - max_pos = std::max(max_pos, cell.pos); + pos_max = std::max(pos_max, cell.pos); } - return max_pos; + return pos_max; } void llama_kv_cache::clear() { @@ -672,6 +676,26 @@ uint32_t llama_kv_cache::cell_max() const { return 0; } +size_t llama_kv_cache::size_k_bytes() const { + size_t size_k_bytes = 0; + + for (const auto & k : k_l) { + size_k_bytes += ggml_nbytes(k); + } + + return size_k_bytes; +} + +size_t llama_kv_cache::size_v_bytes() const { + size_t size_v_bytes = 0; + + for (const auto & v : v_l) { + size_v_bytes += ggml_nbytes(v); + } + + return size_v_bytes; +} + void llama_kv_cache_clear(llama_kv_cache * kv) { 
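    // the public llama_kv_cache_* C functions remain thin wrappers that forward to the
    // member functions, as done here for clear(); callers keep using the C API, e.g.:
    //
    //   llama_kv_cache * kv = llama_get_kv_cache(ctx);
    //   llama_kv_cache_clear(kv); // forwards to kv->clear()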
kv->clear(); } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 4ee3418d8..97285481e 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -61,17 +61,11 @@ struct llama_kv_cache { // computed before each graph build uint32_t n = 0; - ggml_type type_k = GGML_TYPE_F16; - ggml_type type_v = GGML_TYPE_F16; - std::vector cells; std::vector k_l; // per layer std::vector v_l; - std::vector ctxs; - std::vector bufs; - // TODO: become constructor bool init( const llama_model & model, @@ -86,7 +80,7 @@ struct llama_kv_cache { size_t total_size() const; // TODO: better data structures to reduce the cost of this operation - llama_pos max_pos() const; + llama_pos pos_max() const; void clear(); @@ -112,6 +106,16 @@ struct llama_kv_cache { // find how many cells are currently in use uint32_t cell_max() const; + + size_t size_k_bytes() const; + size_t size_v_bytes() const; + +private: + ggml_type type_k = GGML_TYPE_F16; + ggml_type type_v = GGML_TYPE_F16; + + std::vector ctxs; + std::vector bufs; }; // diff --git a/src/llama.cpp b/src/llama.cpp index d8427af9d..0227ba6b3 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1973,7 +1973,7 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -8456,7 +8456,7 @@ static int llama_decode_impl( } // temporary allocate memory for the input batch if needed - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.max_pos() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; const uint32_t n_tokens_all = batch.n_tokens; @@ -8792,7 +8792,7 @@ static int llama_encode_impl( } // temporary allocate memory for the input batch if needed - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.max_pos() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : lctx.kv_self.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; const uint32_t n_tokens = batch.n_tokens; @@ -9699,16 +9699,8 @@ struct llama_context * llama_init_from_model( } { - size_t memory_size_k = 0; - size_t memory_size_v = 0; - - for (auto & k : ctx->kv_self.k_l) { - memory_size_k += ggml_nbytes(k); - } - - for (auto & v : ctx->kv_self.v_l) { - memory_size_v += ggml_nbytes(v); - } + const size_t memory_size_k = ctx->kv_self.size_k_bytes(); + const size_t memory_size_v = ctx->kv_self.size_v_bytes(); LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), From 4cd1b6fa4cc4e8da927caac5c61b9fcd096a1ace Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 14 Jan 2025 12:33:13 +0200 Subject: [PATCH 06/28] context : prepare kv_cache_read/write to be moved to kv_cache ggml-ci --- src/llama-context.cpp | 153 +++++++++++++++++++++--------------------- src/llama-kv-cache.h | 1 + 2 files changed, 76 insertions(+), 78 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 0654feccb..8fc6de2f2 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -928,11 +928,8 @@ struct llama_data_write { } } - void write_kv_cache_data(const struct llama_context * ctx, const std::vector> & cell_ranges) { - const struct llama_kv_cache & kv_self = ctx->kv_self; - const struct llama_hparams & hparams = ctx->model.hparams; - - const uint32_t v_trans = kv_self.v_trans ? 1 : 0; + void write_kv_cache_data(const llama_kv_cache & kv, const llama_hparams & hparams, const std::vector> & cell_ranges) { + const uint32_t v_trans = kv.v_trans ? 1 : 0; const uint32_t n_layer = hparams.n_layer; write(&v_trans, sizeof(v_trans)); @@ -946,52 +943,52 @@ struct llama_data_write { const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); // Write key type - const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type; + const int32_t k_type_i = (int32_t)kv.k_l[il]->type; write(&k_type_i, sizeof(k_type_i)); // Write row size of key - const uint64_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + const uint64_t k_size_row = ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa); write(&k_size_row, sizeof(k_size_row)); // Read each range of cells of k_size length each into tmp_buf and write out for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; const size_t buf_size = range_size * k_size_row; - write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size); + write_tensor_data(kv.k_l[il], range.first * k_size_row, buf_size); } } - if (!kv_self.v_trans) { + if (!kv.v_trans) { for (uint32_t il = 0; il < n_layer; ++il) { const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); // Write value type - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + const int32_t v_type_i = (int32_t)kv.v_l[il]->type; write(&v_type_i, sizeof(v_type_i)); // Write row size of value - const uint64_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa); + const uint64_t v_size_row = ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa); write(&v_size_row, sizeof(v_size_row)); // Read each range of cells of v_size length each into tmp_buf and write out for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; const size_t buf_size = range_size * v_size_row; - write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size); + 
write_tensor_data(kv.v_l[il], range.first * v_size_row, buf_size); } } } else { // When v is transposed, we also need the element size and get the element ranges from each row - const uint32_t kv_size = kv_self.size; + const uint32_t kv_size = kv.size; for (uint32_t il = 0; il < n_layer; ++il) { const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); // Write value type - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + const int32_t v_type_i = (int32_t)kv.v_l[il]->type; write(&v_type_i, sizeof(v_type_i)); // Write element size - const uint32_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); + const uint32_t v_size_el = ggml_type_size(kv.v_l[il]->type); write(&v_size_el, sizeof(v_size_el)); // Write GQA embedding size @@ -1004,37 +1001,36 @@ struct llama_data_write { const size_t range_size = range.second - range.first; const size_t src_offset = (range.first + j * kv_size) * v_size_el; const size_t buf_size = range_size * v_size_el; - write_tensor_data(kv_self.v_l[il], src_offset, buf_size); + write_tensor_data(kv.v_l[il], src_offset, buf_size); } } } } } - void write_kv_cache(const struct llama_context * ctx, llama_seq_id seq_id = -1) { - const struct llama_kv_cache & kv_self = ctx->kv_self; + void write_kv_cache(const llama_kv_cache & kv, const llama_hparams & hparams, llama_seq_id seq_id = -1) { std::vector> cell_ranges; // ranges, from inclusive, to exclusive uint32_t cell_count = 0; // Count the number of cells with the specified seq_id // Find all the ranges of cells with this seq id (or all, when -1) - uint32_t cell_range_begin = kv_self.size; - for (uint32_t i = 0; i < kv_self.size; ++i) { - const auto & cell = kv_self.cells[i]; + uint32_t cell_range_begin = kv.size; + for (uint32_t i = 0; i < kv.size; ++i) { + const auto & cell = kv.cells[i]; if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) { ++cell_count; - if (cell_range_begin == kv_self.size) { + if (cell_range_begin == kv.size) { cell_range_begin = i; } } else { - if (cell_range_begin != kv_self.size) { + if (cell_range_begin != kv.size) { cell_ranges.emplace_back(cell_range_begin, i); - cell_range_begin = kv_self.size; + cell_range_begin = kv.size; } } } - if (cell_range_begin != kv_self.size) { - cell_ranges.emplace_back(cell_range_begin, kv_self.size); + if (cell_range_begin != kv.size) { + cell_ranges.emplace_back(cell_range_begin, kv.size); } // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count @@ -1046,8 +1042,8 @@ struct llama_data_write { write(&cell_count, sizeof(cell_count)); - write_kv_cache_meta(kv_self, cell_ranges, seq_id); - write_kv_cache_data(ctx, cell_ranges); + write_kv_cache_meta(kv, cell_ranges, seq_id); + write_kv_cache_data(kv, hparams, cell_ranges); } }; @@ -1140,15 +1136,15 @@ struct llama_data_read { } } - bool read_kv_cache_meta(struct llama_context * ctx, uint32_t cell_count, llama_seq_id dest_seq_id = -1) { - struct llama_kv_cache & kv_self = ctx->kv_self; - + bool read_kv_cache_meta(llama_kv_cache & kv, uint32_t cell_count, llama_seq_id dest_seq_id = -1) { if (dest_seq_id != -1) { // single sequence - kv_self.seq_rm(dest_seq_id, -1, -1); + kv.seq_rm(dest_seq_id, -1, -1); + + llama_sbatch sbatch; + llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false); - llama_ubatch batch = ctx->sbatch.reserve_ubatch(cell_count, /* has_embd */ false); batch.n_tokens = cell_count; batch.n_seq_tokens = cell_count; batch.n_seqs = 1; @@ -1157,7 +1153,7 @@ struct llama_data_read { llama_pos pos; uint32_t n_seq_id; - 
read_to(&pos, sizeof(pos)); + read_to(&pos, sizeof(pos)); read_to(&n_seq_id, sizeof(n_seq_id)); if (n_seq_id != 0) { @@ -1169,30 +1165,30 @@ struct llama_data_read { } batch.n_seq_id[0] = 1; batch.seq_id[0] = &dest_seq_id; - if (!kv_self.find_slot(batch)) { + if (!kv.find_slot(batch)) { LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); return false; } - // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values) + // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values) // Assume that this is one contiguous block of cells - GGML_ASSERT(kv_self.head + cell_count <= kv_self.size); - GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]); - GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]); - GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id)); - GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id)); + GGML_ASSERT(kv.head + cell_count <= kv.size); + GGML_ASSERT(kv.cells[kv.head].pos == batch.pos[0]); + GGML_ASSERT(kv.cells[kv.head + cell_count - 1].pos == batch.pos[cell_count - 1]); + GGML_ASSERT(kv.cells[kv.head].has_seq_id(dest_seq_id)); + GGML_ASSERT(kv.cells[kv.head + cell_count - 1].has_seq_id(dest_seq_id)); } else { // whole KV cache restore - if (cell_count > kv_self.size) { + if (cell_count > kv.size) { LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__); return false; } - kv_self.clear(); + kv.clear(); for (uint32_t i = 0; i < cell_count; ++i) { - llama_kv_cell & cell = kv_self.cells[i]; + llama_kv_cell & cell = kv.cells[i]; llama_pos pos; uint32_t n_seq_id; @@ -1206,15 +1202,18 @@ struct llama_data_read { llama_seq_id seq_id; read_to(&seq_id, sizeof(seq_id)); - if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { - LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); + // TODO: llama_kv_cache should have a notion of max sequences + //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { + if (seq_id < 0) { + //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); + LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id); return false; } cell.seq_id.insert(seq_id); - if (kv_self.recurrent) { - int32_t & tail = kv_self.cells[seq_id].tail; + if (kv.recurrent) { + int32_t & tail = kv.cells[seq_id].tail; if (tail != -1) { LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail); return false; @@ -1224,24 +1223,22 @@ struct llama_data_read { } } - kv_self.head = 0; - kv_self.used = cell_count; + kv.head = 0; + kv.used = cell_count; } - if (kv_self.recurrent) { + if (kv.recurrent) { for (uint32_t i = 0; i < cell_count; ++i) { - uint32_t cell_id = kv_self.head + i; + uint32_t cell_id = kv.head + i; // make sure the recurrent states will keep their restored state - kv_self.cells[cell_id].src = cell_id; + kv.cells[cell_id].src = cell_id; } } return true; } - bool read_kv_cache_data(struct llama_context * ctx, uint32_t cell_count) { - const struct llama_hparams & hparams = ctx->model.hparams; - struct llama_kv_cache & kv_self = ctx->kv_self; + bool read_kv_cache_data(llama_kv_cache & kv, const llama_hparams & hparams, uint32_t cell_count) { uint32_t v_trans; uint32_t n_layer; read_to(&v_trans, sizeof(v_trans)); @@ 
-1251,11 +1248,11 @@ struct llama_data_read { LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer); return false; } - if (cell_count > kv_self.size) { - LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, kv_self.size); + if (cell_count > kv.size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, kv.size); return false; } - if (kv_self.v_trans != (bool) v_trans) { + if (kv.v_trans != (bool) v_trans) { LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__); return false; } @@ -1267,7 +1264,7 @@ struct llama_data_read { // Read type of key int32_t k_type_i_ref; read_to(&k_type_i_ref, sizeof(k_type_i_ref)); - const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type; + const int32_t k_type_i = (int32_t)kv.k_l[il]->type; if (k_type_i != k_type_i_ref) { LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); return false; @@ -1276,7 +1273,7 @@ struct llama_data_read { // Read row size of key uint64_t k_size_row_ref; read_to(&k_size_row_ref, sizeof(k_size_row_ref)); - const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + const size_t k_size_row = ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa); if (k_size_row != k_size_row_ref) { LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il); return false; @@ -1284,18 +1281,18 @@ struct llama_data_read { if (cell_count) { // Read and set the keys for the whole cell range - ggml_backend_tensor_set(kv_self.k_l[il], read(cell_count * k_size_row), kv_self.head * k_size_row, cell_count * k_size_row); + ggml_backend_tensor_set(kv.k_l[il], read(cell_count * k_size_row), kv.head * k_size_row, cell_count * k_size_row); } } - if (!kv_self.v_trans) { + if (!kv.v_trans) { for (uint32_t il = 0; il < n_layer; ++il) { const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); // Read type of value int32_t v_type_i_ref; read_to(&v_type_i_ref, sizeof(v_type_i_ref)); - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + const int32_t v_type_i = (int32_t)kv.v_l[il]->type; if (v_type_i != v_type_i_ref) { LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); return false; @@ -1304,7 +1301,7 @@ struct llama_data_read { // Read row size of value uint64_t v_size_row_ref; read_to(&v_size_row_ref, sizeof(v_size_row_ref)); - const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa); + const size_t v_size_row = ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa); if (v_size_row != v_size_row_ref) { LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); return false; @@ -1312,7 +1309,7 @@ struct llama_data_read { if (cell_count) { // Read and set the values for the whole cell range - ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_row), kv_self.head * v_size_row, cell_count * v_size_row); + ggml_backend_tensor_set(kv.v_l[il], read(cell_count * v_size_row), kv.head * v_size_row, cell_count * v_size_row); } } } else { @@ -1323,7 +1320,7 @@ struct llama_data_read { // Read type of value int32_t v_type_i_ref; read_to(&v_type_i_ref, sizeof(v_type_i_ref)); - const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + const int32_t v_type_i = (int32_t)kv.v_l[il]->type; if (v_type_i != v_type_i_ref) { 
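                // the value type recorded in the session data does not match the type used by the
                // current cache, so its raw tensor bytes cannot be reinterpreted - abort the restore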
LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); return false; @@ -1332,7 +1329,7 @@ struct llama_data_read { // Read element size of value uint32_t v_size_el_ref; read_to(&v_size_el_ref, sizeof(v_size_el_ref)); - const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); + const size_t v_size_el = ggml_type_size(kv.v_l[il]->type); if (v_size_el != v_size_el_ref) { LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); return false; @@ -1349,8 +1346,8 @@ struct llama_data_read { if (cell_count) { // For each row in the transposed matrix, read the values for the whole cell range for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - const size_t dst_offset = (kv_self.head + j * kv_self.size) * v_size_el; - ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); + const size_t dst_offset = (kv.head + j * kv.size) * v_size_el; + ggml_backend_tensor_set(kv.v_l[il], read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); } } } @@ -1358,17 +1355,17 @@ struct llama_data_read { return true; } - void read_kv_cache(struct llama_context * ctx, llama_seq_id seq_id = -1) { + void read_kv_cache(llama_kv_cache & kv, const llama_hparams & hparams, llama_seq_id seq_id = -1) { uint32_t cell_count; read_to(&cell_count, sizeof(cell_count)); - bool res = read_kv_cache_meta(ctx, cell_count, seq_id) && read_kv_cache_data(ctx, cell_count); + bool res = read_kv_cache_meta(kv, cell_count, seq_id) && read_kv_cache_data(kv, hparams, cell_count); if (!res) { if (seq_id == -1) { - ctx->kv_self.clear(); + kv.clear(); } else { - ctx->kv_self.seq_rm(seq_id, -1, -1); + kv.seq_rm(seq_id, -1, -1); } throw std::runtime_error("failed to restore kv cache"); } @@ -1521,7 +1518,7 @@ static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_da data_ctx.write_logits(ctx); data_ctx.write_embeddings(ctx); - data_ctx.write_kv_cache(ctx); + data_ctx.write_kv_cache(ctx->kv_self, ctx->model.hparams); return data_ctx.get_size_written(); } @@ -1558,7 +1555,7 @@ static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_da data_ctx.read_logits(ctx); data_ctx.read_embeddings(ctx); - data_ctx.read_kv_cache(ctx); + data_ctx.read_kv_cache(ctx->kv_self, ctx->model.hparams); return data_ctx.get_size_read(); } @@ -1654,7 +1651,7 @@ bool llama_state_save_file(struct llama_context * ctx, const char * path_session static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx, llama_seq_id seq_id) { llama_synchronize(ctx); - data_ctx.write_kv_cache(ctx, seq_id); + data_ctx.write_kv_cache(ctx->kv_self, ctx->model.hparams, seq_id); return data_ctx.get_size_written(); } @@ -1677,7 +1674,7 @@ size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_ static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx, llama_seq_id dest_seq_id) { llama_synchronize(ctx); - data_ctx.read_kv_cache(ctx, dest_seq_id); + data_ctx.read_kv_cache(ctx->kv_self, ctx->model.hparams, dest_seq_id); return data_ctx.get_size_read(); } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 97285481e..7fc2fabf5 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -44,6 +44,7 @@ struct llama_kv_cache_slot_info { // ring-buffer of cached KV data // TODO: pimpl +// TODO: add notion of max sequences struct llama_kv_cache { bool 
has_shift = false; bool do_defrag = false; From fd05ab87aad1221535da86d5cd810ee5856ebb49 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 14 Jan 2025 13:13:35 +0200 Subject: [PATCH 07/28] kv_cache : move state read/write to llama_kv_cache ggml-ci --- src/llama-context.cpp | 424 +++++------------------------------------ src/llama-kv-cache.cpp | 378 ++++++++++++++++++++++++++++++++++++ src/llama-kv-cache.h | 20 ++ 3 files changed, 446 insertions(+), 376 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 8fc6de2f2..0e146652c 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -908,143 +908,6 @@ struct llama_data_write { write(ctx->embd, embeddings_size * sizeof(float)); } } - - void write_kv_cache_meta(const llama_kv_cache & kv_self, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) { - for (const auto & range : cell_ranges) { - for (uint32_t i = range.first; i < range.second; ++i) { - const auto & cell = kv_self.cells[i]; - const llama_pos pos = cell.pos; - const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0; - - write(&pos, sizeof(pos)); - write(&n_seq_id, sizeof(n_seq_id)); - - if (n_seq_id) { - for (auto seq_id : cell.seq_id) { - write(&seq_id, sizeof(seq_id)); - } - } - } - } - } - - void write_kv_cache_data(const llama_kv_cache & kv, const llama_hparams & hparams, const std::vector> & cell_ranges) { - const uint32_t v_trans = kv.v_trans ? 1 : 0; - const uint32_t n_layer = hparams.n_layer; - - write(&v_trans, sizeof(v_trans)); - write(&n_layer, sizeof(n_layer)); - - std::vector tmp_buf; - - // Iterate and write all the keys first, each row is a cell - // Get whole range at a time - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); - - // Write key type - const int32_t k_type_i = (int32_t)kv.k_l[il]->type; - write(&k_type_i, sizeof(k_type_i)); - - // Write row size of key - const uint64_t k_size_row = ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa); - write(&k_size_row, sizeof(k_size_row)); - - // Read each range of cells of k_size length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - const size_t buf_size = range_size * k_size_row; - write_tensor_data(kv.k_l[il], range.first * k_size_row, buf_size); - } - } - - if (!kv.v_trans) { - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Write value type - const int32_t v_type_i = (int32_t)kv.v_l[il]->type; - write(&v_type_i, sizeof(v_type_i)); - - // Write row size of value - const uint64_t v_size_row = ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa); - write(&v_size_row, sizeof(v_size_row)); - - // Read each range of cells of v_size length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - const size_t buf_size = range_size * v_size_row; - write_tensor_data(kv.v_l[il], range.first * v_size_row, buf_size); - } - } - } else { - // When v is transposed, we also need the element size and get the element ranges from each row - const uint32_t kv_size = kv.size; - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Write value type - const int32_t v_type_i = (int32_t)kv.v_l[il]->type; - write(&v_type_i, sizeof(v_type_i)); - - // Write element size - const uint32_t v_size_el = 
ggml_type_size(kv.v_l[il]->type); - write(&v_size_el, sizeof(v_size_el)); - - // Write GQA embedding size - write(&n_embd_v_gqa, sizeof(n_embd_v_gqa)); - - // For each row, we get the element values of each cell - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - // Read each range of cells of v_size_el length each into tmp_buf and write out - for (const auto & range : cell_ranges) { - const size_t range_size = range.second - range.first; - const size_t src_offset = (range.first + j * kv_size) * v_size_el; - const size_t buf_size = range_size * v_size_el; - write_tensor_data(kv.v_l[il], src_offset, buf_size); - } - } - } - } - } - - void write_kv_cache(const llama_kv_cache & kv, const llama_hparams & hparams, llama_seq_id seq_id = -1) { - std::vector> cell_ranges; // ranges, from inclusive, to exclusive - uint32_t cell_count = 0; - - // Count the number of cells with the specified seq_id - // Find all the ranges of cells with this seq id (or all, when -1) - uint32_t cell_range_begin = kv.size; - for (uint32_t i = 0; i < kv.size; ++i) { - const auto & cell = kv.cells[i]; - if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) { - ++cell_count; - if (cell_range_begin == kv.size) { - cell_range_begin = i; - } - } else { - if (cell_range_begin != kv.size) { - cell_ranges.emplace_back(cell_range_begin, i); - cell_range_begin = kv.size; - } - } - } - if (cell_range_begin != kv.size) { - cell_ranges.emplace_back(cell_range_begin, kv.size); - } - - // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count - uint32_t cell_count_check = 0; - for (const auto & range : cell_ranges) { - cell_count_check += range.second - range.first; - } - GGML_ASSERT(cell_count == cell_count_check); - - write(&cell_count, sizeof(cell_count)); - - write_kv_cache_meta(kv, cell_ranges, seq_id); - write_kv_cache_data(kv, hparams, cell_ranges); - } }; struct llama_data_read { @@ -1135,241 +998,6 @@ struct llama_data_read { read_to(ctx->embd, embeddings_size * sizeof(float)); } } - - bool read_kv_cache_meta(llama_kv_cache & kv, uint32_t cell_count, llama_seq_id dest_seq_id = -1) { - if (dest_seq_id != -1) { - // single sequence - - kv.seq_rm(dest_seq_id, -1, -1); - - llama_sbatch sbatch; - llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false); - - batch.n_tokens = cell_count; - batch.n_seq_tokens = cell_count; - batch.n_seqs = 1; - - for (uint32_t i = 0; i < cell_count; ++i) { - llama_pos pos; - uint32_t n_seq_id; - - read_to(&pos, sizeof(pos)); - read_to(&n_seq_id, sizeof(n_seq_id)); - - if (n_seq_id != 0) { - LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__); - return false; - } - - batch.pos[i] = pos; - } - batch.n_seq_id[0] = 1; - batch.seq_id[0] = &dest_seq_id; - if (!kv.find_slot(batch)) { - LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); - return false; - } - - // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values) - // Assume that this is one contiguous block of cells - GGML_ASSERT(kv.head + cell_count <= kv.size); - GGML_ASSERT(kv.cells[kv.head].pos == batch.pos[0]); - GGML_ASSERT(kv.cells[kv.head + cell_count - 1].pos == batch.pos[cell_count - 1]); - GGML_ASSERT(kv.cells[kv.head].has_seq_id(dest_seq_id)); - GGML_ASSERT(kv.cells[kv.head + cell_count - 1].has_seq_id(dest_seq_id)); - } else { - // whole KV cache restore - - if (cell_count > kv.size) { - LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__); - return false; - } - - 
kv.clear(); - - for (uint32_t i = 0; i < cell_count; ++i) { - llama_kv_cell & cell = kv.cells[i]; - - llama_pos pos; - uint32_t n_seq_id; - - read_to(&pos, sizeof(pos)); - read_to(&n_seq_id, sizeof(n_seq_id)); - - cell.pos = pos; - - for (uint32_t j = 0; j < n_seq_id; ++j) { - llama_seq_id seq_id; - read_to(&seq_id, sizeof(seq_id)); - - // TODO: llama_kv_cache should have a notion of max sequences - //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { - if (seq_id < 0) { - //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); - LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id); - return false; - } - - cell.seq_id.insert(seq_id); - - if (kv.recurrent) { - int32_t & tail = kv.cells[seq_id].tail; - if (tail != -1) { - LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail); - return false; - } - tail = i; - } - } - } - - kv.head = 0; - kv.used = cell_count; - } - - if (kv.recurrent) { - for (uint32_t i = 0; i < cell_count; ++i) { - uint32_t cell_id = kv.head + i; - // make sure the recurrent states will keep their restored state - kv.cells[cell_id].src = cell_id; - } - } - - return true; - } - - bool read_kv_cache_data(llama_kv_cache & kv, const llama_hparams & hparams, uint32_t cell_count) { - uint32_t v_trans; - uint32_t n_layer; - read_to(&v_trans, sizeof(v_trans)); - read_to(&n_layer, sizeof(n_layer)); - - if (n_layer != hparams.n_layer) { - LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer); - return false; - } - if (cell_count > kv.size) { - LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, kv.size); - return false; - } - if (kv.v_trans != (bool) v_trans) { - LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__); - return false; - } - - // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); - - // Read type of key - int32_t k_type_i_ref; - read_to(&k_type_i_ref, sizeof(k_type_i_ref)); - const int32_t k_type_i = (int32_t)kv.k_l[il]->type; - if (k_type_i != k_type_i_ref) { - LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); - return false; - } - - // Read row size of key - uint64_t k_size_row_ref; - read_to(&k_size_row_ref, sizeof(k_size_row_ref)); - const size_t k_size_row = ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa); - if (k_size_row != k_size_row_ref) { - LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il); - return false; - } - - if (cell_count) { - // Read and set the keys for the whole cell range - ggml_backend_tensor_set(kv.k_l[il], read(cell_count * k_size_row), kv.head * k_size_row, cell_count * k_size_row); - } - } - - if (!kv.v_trans) { - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Read type of value - int32_t v_type_i_ref; - read_to(&v_type_i_ref, sizeof(v_type_i_ref)); - const int32_t v_type_i = (int32_t)kv.v_l[il]->type; - if (v_type_i != v_type_i_ref) { - LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); - return false; - } - - // Read row size of value - uint64_t v_size_row_ref; 
- read_to(&v_size_row_ref, sizeof(v_size_row_ref)); - const size_t v_size_row = ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa); - if (v_size_row != v_size_row_ref) { - LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); - return false; - } - - if (cell_count) { - // Read and set the values for the whole cell range - ggml_backend_tensor_set(kv.v_l[il], read(cell_count * v_size_row), kv.head * v_size_row, cell_count * v_size_row); - } - } - } else { - // For each layer, read the values for each cell (transposed) - for (uint32_t il = 0; il < n_layer; ++il) { - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); - - // Read type of value - int32_t v_type_i_ref; - read_to(&v_type_i_ref, sizeof(v_type_i_ref)); - const int32_t v_type_i = (int32_t)kv.v_l[il]->type; - if (v_type_i != v_type_i_ref) { - LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); - return false; - } - - // Read element size of value - uint32_t v_size_el_ref; - read_to(&v_size_el_ref, sizeof(v_size_el_ref)); - const size_t v_size_el = ggml_type_size(kv.v_l[il]->type); - if (v_size_el != v_size_el_ref) { - LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); - return false; - } - - // Read GQA embedding size - uint32_t n_embd_v_gqa_ref; - read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref)); - if (n_embd_v_gqa != n_embd_v_gqa_ref) { - LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il); - return false; - } - - if (cell_count) { - // For each row in the transposed matrix, read the values for the whole cell range - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - const size_t dst_offset = (kv.head + j * kv.size) * v_size_el; - ggml_backend_tensor_set(kv.v_l[il], read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); - } - } - } - } - return true; - } - - void read_kv_cache(llama_kv_cache & kv, const llama_hparams & hparams, llama_seq_id seq_id = -1) { - uint32_t cell_count; - read_to(&cell_count, sizeof(cell_count)); - - bool res = read_kv_cache_meta(kv, cell_count, seq_id) && read_kv_cache_data(kv, hparams, cell_count); - - if (!res) { - if (seq_id == -1) { - kv.clear(); - } else { - kv.seq_rm(seq_id, -1, -1); - } - throw std::runtime_error("failed to restore kv cache"); - } - } }; struct llama_data_write_dummy : llama_data_write { @@ -1518,7 +1146,18 @@ static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_da data_ctx.write_logits(ctx); data_ctx.write_embeddings(ctx); - data_ctx.write_kv_cache(ctx->kv_self, ctx->model.hparams); + llama_kv_cache::io io = { + /* .write =*/ [&](const void * src, size_t size) { + data_ctx.write(src, size); + }, + /* .write_tensor_data =*/ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { + data_ctx.write_tensor_data(tensor, offset, size); + }, + /* .read =*/ nullptr, + /* .read_to =*/ nullptr, + }; + + ctx->kv_self.state_write(io, ctx->model.hparams); return data_ctx.get_size_written(); } @@ -1555,7 +1194,18 @@ static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_da data_ctx.read_logits(ctx); data_ctx.read_embeddings(ctx); - data_ctx.read_kv_cache(ctx->kv_self, ctx->model.hparams); + llama_kv_cache::io io = { + /* .write =*/ nullptr, + /* .write_tensor_data =*/ nullptr, + /* .read =*/ [&](size_t size) { + return 
data_ctx.read(size); + }, + /* .read_to =*/ [&](void * dst, size_t size) { + data_ctx.read_to(dst, size); + }, + }; + + ctx->kv_self.state_read(io, ctx->model.hparams); return data_ctx.get_size_read(); } @@ -1651,7 +1301,18 @@ bool llama_state_save_file(struct llama_context * ctx, const char * path_session static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx, llama_seq_id seq_id) { llama_synchronize(ctx); - data_ctx.write_kv_cache(ctx->kv_self, ctx->model.hparams, seq_id); + llama_kv_cache::io io = { + /* .write =*/ [&](const void * src, size_t size) { + data_ctx.write(src, size); + }, + /* .write_tensor_data =*/ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { + data_ctx.write_tensor_data(tensor, offset, size); + }, + /* .read =*/ nullptr, + /* .read_to =*/ nullptr, + }; + + ctx->kv_self.state_write(io, ctx->model.hparams, seq_id); return data_ctx.get_size_written(); } @@ -1674,7 +1335,18 @@ size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_ static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx, llama_seq_id dest_seq_id) { llama_synchronize(ctx); - data_ctx.read_kv_cache(ctx->kv_self, ctx->model.hparams, dest_seq_id); + llama_kv_cache::io io = { + /* .write =*/ nullptr, + /* .write_tensor_data =*/ nullptr, + /* .read =*/ [&](size_t size) { + return data_ctx.read(size); + }, + /* .read_to =*/ [&](void * dst, size_t size) { + data_ctx.read_to(dst, size); + }, + }; + + ctx->kv_self.state_read(io, ctx->model.hparams, dest_seq_id); return data_ctx.get_size_read(); } diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 9f3b4e514..6886d24f0 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -8,6 +8,7 @@ #include #include #include +#include static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false}; @@ -696,6 +697,383 @@ size_t llama_kv_cache::size_v_bytes() const { return size_v_bytes; } +void llama_kv_cache::state_write(const io & io, const llama_hparams & hparams, llama_seq_id seq_id) const { + std::vector> cell_ranges; // ranges, from inclusive, to exclusive + uint32_t cell_count = 0; + + // Count the number of cells with the specified seq_id + // Find all the ranges of cells with this seq id (or all, when -1) + uint32_t cell_range_begin = size; + for (uint32_t i = 0; i < size; ++i) { + const auto & cell = cells[i]; + if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) { + ++cell_count; + if (cell_range_begin == size) { + cell_range_begin = i; + } + } else { + if (cell_range_begin != size) { + cell_ranges.emplace_back(cell_range_begin, i); + cell_range_begin = size; + } + } + } + if (cell_range_begin != size) { + cell_ranges.emplace_back(cell_range_begin, size); + } + + // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count + uint32_t cell_count_check = 0; + for (const auto & range : cell_ranges) { + cell_count_check += range.second - range.first; + } + GGML_ASSERT(cell_count == cell_count_check); + + io.write(&cell_count, sizeof(cell_count)); + + state_write_meta(io, cell_ranges, seq_id); + state_write_data(io, cell_ranges, hparams); +} + +void llama_kv_cache::state_read(const io & io, const llama_hparams & hparams, llama_seq_id seq_id) { + uint32_t cell_count; + io.read_to(&cell_count, sizeof(cell_count)); + + bool res = true; + res = res && state_read_meta(io, cell_count, seq_id); + res = res && state_read_data(io, hparams, cell_count); + + if (!res) { + 
if (seq_id == -1) { + clear(); + } else { + seq_rm(seq_id, -1, -1); + } + throw std::runtime_error("failed to restore kv cache"); + } +} + +void llama_kv_cache::state_write_meta(const io & io, const std::vector> & cell_ranges, llama_seq_id seq_id) const { + for (const auto & range : cell_ranges) { + for (uint32_t i = range.first; i < range.second; ++i) { + const auto & cell = cells[i]; + const llama_pos pos = cell.pos; + const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0; + + io.write(&pos, sizeof(pos)); + io.write(&n_seq_id, sizeof(n_seq_id)); + + if (n_seq_id) { + for (auto seq_id : cell.seq_id) { + io.write(&seq_id, sizeof(seq_id)); + } + } + } + } +} + +void llama_kv_cache::state_write_data(const io & io, const std::vector> & cell_ranges, const llama_hparams & hparams) const { + const uint32_t v_trans = this->v_trans ? 1 : 0; + const uint32_t n_layer = hparams.n_layer; + + io.write(&v_trans, sizeof(v_trans)); + io.write(&n_layer, sizeof(n_layer)); + + std::vector tmp_buf; + + // Iterate and write all the keys first, each row is a cell + // Get whole range at a time + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + + // Write key type + const int32_t k_type_i = (int32_t)k_l[il]->type; + io.write(&k_type_i, sizeof(k_type_i)); + + // Write row size of key + const uint64_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); + io.write(&k_size_row, sizeof(k_size_row)); + + // Read each range of cells of k_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t buf_size = range_size * k_size_row; + io.write_tensor_data(k_l[il], range.first * k_size_row, buf_size); + } + } + + if (!v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Write value type + const int32_t v_type_i = (int32_t)v_l[il]->type; + io.write(&v_type_i, sizeof(v_type_i)); + + // Write row size of value + const uint64_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa); + io.write(&v_size_row, sizeof(v_size_row)); + + // Read each range of cells of v_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t buf_size = range_size * v_size_row; + io.write_tensor_data(v_l[il], range.first * v_size_row, buf_size); + } + } + } else { + // When v is transposed, we also need the element size and get the element ranges from each row + const uint32_t kv_size = size; + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Write value type + const int32_t v_type_i = (int32_t)v_l[il]->type; + io.write(&v_type_i, sizeof(v_type_i)); + + // Write element size + const uint32_t v_size_el = ggml_type_size(v_l[il]->type); + io.write(&v_size_el, sizeof(v_size_el)); + + // Write GQA embedding size + io.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa)); + + // For each row, we get the element values of each cell + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + // Read each range of cells of v_size_el length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t src_offset = (range.first + j * kv_size) * v_size_el; + const size_t buf_size = range_size * v_size_el; + io.write_tensor_data(v_l[il], 
src_offset, buf_size); + } + } + } +} + +bool llama_kv_cache::state_read_meta(const io & io, uint32_t cell_count, llama_seq_id dest_seq_id) { + if (dest_seq_id != -1) { + // single sequence + + seq_rm(dest_seq_id, -1, -1); + + llama_sbatch sbatch; + llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false); + + batch.n_tokens = cell_count; + batch.n_seq_tokens = cell_count; + batch.n_seqs = 1; + + for (uint32_t i = 0; i < cell_count; ++i) { + llama_pos pos; + uint32_t n_seq_id; + + io.read_to(&pos, sizeof(pos)); + io.read_to(&n_seq_id, sizeof(n_seq_id)); + + if (n_seq_id != 0) { + LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__); + return false; + } + + batch.pos[i] = pos; + } + batch.n_seq_id[0] = 1; + batch.seq_id[0] = &dest_seq_id; + if (!find_slot(batch)) { + LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); + return false; + } + + // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values) + // Assume that this is one contiguous block of cells + GGML_ASSERT(head + cell_count <= size); + GGML_ASSERT(cells[head].pos == batch.pos[0]); + GGML_ASSERT(cells[head + cell_count - 1].pos == batch.pos[cell_count - 1]); + GGML_ASSERT(cells[head].has_seq_id(dest_seq_id)); + GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id)); + } else { + // whole KV cache restore + + if (cell_count > size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__); + return false; + } + + clear(); + + for (uint32_t i = 0; i < cell_count; ++i) { + llama_kv_cell & cell = cells[i]; + + llama_pos pos; + uint32_t n_seq_id; + + io.read_to(&pos, sizeof(pos)); + io.read_to(&n_seq_id, sizeof(n_seq_id)); + + cell.pos = pos; + + for (uint32_t j = 0; j < n_seq_id; ++j) { + llama_seq_id seq_id; + io.read_to(&seq_id, sizeof(seq_id)); + + // TODO: llama_kv_cache should have a notion of max sequences + //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { + if (seq_id < 0) { + //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); + LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id); + return false; + } + + cell.seq_id.insert(seq_id); + + if (recurrent) { + int32_t & tail = cells[seq_id].tail; + if (tail != -1) { + LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail); + return false; + } + tail = i; + } + } + } + + head = 0; + used = cell_count; + } + + if (recurrent) { + for (uint32_t i = 0; i < cell_count; ++i) { + uint32_t cell_id = head + i; + // make sure the recurrent states will keep their restored state + cells[cell_id].src = cell_id; + } + } + + return true; +} + +bool llama_kv_cache::state_read_data(const io & io, const llama_hparams & hparams, uint32_t cell_count) { + uint32_t v_trans; + uint32_t n_layer; + io.read_to(&v_trans, sizeof(v_trans)); + io.read_to(&n_layer, sizeof(n_layer)); + + if (n_layer != hparams.n_layer) { + LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer); + return false; + } + if (cell_count > size) { + LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size); + return false; + } + if (this->v_trans != (bool) v_trans) { + LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__); + return false; + } + + // For each layer, read the keys for each cell, one row is one cell, read as one
contiguous block + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s(); + + // Read type of key + int32_t k_type_i_ref; + io.read_to(&k_type_i_ref, sizeof(k_type_i_ref)); + const int32_t k_type_i = (int32_t) k_l[il]->type; + if (k_type_i != k_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); + return false; + } + + // Read row size of key + uint64_t k_size_row_ref; + io.read_to(&k_size_row_ref, sizeof(k_size_row_ref)); + const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); + if (k_size_row != k_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il); + return false; + } + + if (cell_count) { + // Read and set the keys for the whole cell range + ggml_backend_tensor_set(k_l[il], io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row); + } + } + + if (!v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Read type of value + int32_t v_type_i_ref; + io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } + + // Read row size of value + uint64_t v_size_row_ref; + io.read_to(&v_size_row_ref, sizeof(v_size_row_ref)); + const size_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa); + if (v_size_row != v_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); + return false; + } + + if (cell_count) { + // Read and set the values for the whole cell range + ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row); + } + } + } else { + // For each layer, read the values for each cell (transposed) + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Read type of value + int32_t v_type_i_ref; + io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } + + // Read element size of value + uint32_t v_size_el_ref; + io.read_to(&v_size_el_ref, sizeof(v_size_el_ref)); + const size_t v_size_el = ggml_type_size(v_l[il]->type); + if (v_size_el != v_size_el_ref) { + LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); + return false; + } + + // Read GQA embedding size + uint32_t n_embd_v_gqa_ref; + io.read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref)); + if (n_embd_v_gqa != n_embd_v_gqa_ref) { + LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il); + return false; + } + + if (cell_count) { + // For each row in the transposed matrix, read the values for the whole cell range + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + const size_t dst_offset = (head + j * size) * v_size_el; + ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); + } + } 
+ } + } + + return true; +} + +///////////// + void llama_kv_cache_clear(llama_kv_cache * kv) { kv->clear(); } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 7fc2fabf5..0384a2b7c 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -6,8 +6,10 @@ #include #include +#include struct llama_cparams; +struct llama_hparams; struct llama_ubatch; struct llama_kv_cell { @@ -45,6 +47,7 @@ struct llama_kv_cache_slot_info { // ring-buffer of cached KV data // TODO: pimpl // TODO: add notion of max sequences +// TODO: add llama_hparams & struct llama_kv_cache { bool has_shift = false; bool do_defrag = false; @@ -111,12 +114,29 @@ struct llama_kv_cache { size_t size_k_bytes() const; size_t size_v_bytes() const; + struct io { + std::function write; + std::function write_tensor_data; + + std::function read; + std::function read_to; + }; + + void state_write(const io & io, const llama_hparams & hparams, llama_seq_id seq_id = -1) const; + void state_read (const io & io, const llama_hparams & hparams, llama_seq_id seq_id = -1); + private: ggml_type type_k = GGML_TYPE_F16; ggml_type type_v = GGML_TYPE_F16; std::vector ctxs; std::vector bufs; + + void state_write_meta(const io & io, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) const; + void state_write_data(const io & io, const std::vector> & cell_ranges, const llama_hparams & hparams) const; + + bool state_read_meta(const io & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1); + bool state_read_data(const io & io, const llama_hparams & hparams, uint32_t cell_count); }; // From 17b363afd3575f8f9d025a35d2abb75f528a64c2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 14 Jan 2025 16:47:34 +0200 Subject: [PATCH 08/28] llama : update llama_kv_self API ggml-ci --- common/common.cpp | 6 +- common/speculative.cpp | 10 +- examples/batched-bench/batched-bench.cpp | 6 +- examples/batched.swift/Sources/main.swift | 2 +- .../cvector-generator/cvector-generator.cpp | 3 +- examples/embedding/embedding.cpp | 5 +- examples/gritlm/gritlm.cpp | 8 +- examples/imatrix/imatrix.cpp | 4 +- examples/infill/infill.cpp | 6 +- examples/llama-bench/llama-bench.cpp | 6 +- .../llama/src/main/cpp/llama-android.cpp | 8 +- .../llama.cpp.swift/LibLlama.swift | 8 +- examples/lookahead/lookahead.cpp | 13 +- examples/lookup/lookup.cpp | 3 +- examples/main/main.cpp | 14 +- examples/parallel/parallel.cpp | 11 +- examples/passkey/passkey.cpp | 30 ++-- examples/perplexity/perplexity.cpp | 24 +-- examples/retrieval/retrieval.cpp | 4 +- examples/run/run.cpp | 7 +- examples/save-load-state/save-load-state.cpp | 4 +- examples/server/server.cpp | 25 ++- examples/simple-chat/simple-chat.cpp | 6 +- .../speculative-simple/speculative-simple.cpp | 4 +- examples/speculative/speculative.cpp | 29 ++-- include/llama.h | 105 ++++++++++--- src/llama-context.cpp | 34 ++-- src/llama-kv-cache.cpp | 20 +-- src/llama-kv-cache.h | 42 +++++ src/llama.cpp | 145 +++++++++++++++++- 30 files changed, 387 insertions(+), 205 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 29de45189..098feebee 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -952,9 +952,7 @@ struct common_init_result common_init_from_params(common_params & params) { return iparams; } - llama_kv_cache * kv = llama_get_kv_cache(lctx); - - if (params.ctx_shift && !llama_kv_cache_can_shift(kv)) { + if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) { LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__); params.ctx_shift = 
false; } @@ -1059,7 +1057,7 @@ struct common_init_result common_init_from_params(common_params & params) { if (llama_model_has_decoder(model)) { llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch))); } - llama_kv_cache_clear(kv); + llama_kv_self_clear(lctx); llama_synchronize(lctx); llama_perf_context_reset(lctx); } diff --git a/common/speculative.cpp b/common/speculative.cpp index 6ac058517..a660f198a 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -171,10 +171,8 @@ llama_tokens common_speculative_gen_draft( llama_tokens result; result.reserve(params.n_draft); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - if (reuse_n == 0) { - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); prompt.clear(); } else { @@ -193,14 +191,14 @@ llama_tokens common_speculative_gen_draft( } if (reuse_i > 0) { - llama_kv_cache_seq_rm (kv, 0, 0, reuse_i); - llama_kv_cache_seq_add(kv, 0, reuse_i, -1, -reuse_i); + llama_kv_self_seq_rm (ctx, 0, 0, reuse_i); + llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i); prompt.erase(prompt.begin(), prompt.begin() + reuse_i); } if (reuse_n < (int) prompt.size()) { - llama_kv_cache_seq_rm (kv, 0, reuse_n, -1); + llama_kv_self_seq_rm (ctx, 0, reuse_n, -1); prompt.erase(prompt.begin() + reuse_n, prompt.end()); } diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index fcbad37bb..430e8be51 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -57,8 +57,6 @@ int main(int argc, char ** argv) { return 1; } - llama_kv_cache * kv = llama_get_kv_cache(ctx); - const int32_t n_kv_max = llama_n_ctx(ctx); llama_batch batch = llama_batch_init(n_kv_max, 0, 1); @@ -134,7 +132,7 @@ int main(int argc, char ** argv) { const auto t_pp_start = ggml_time_us(); - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); if (!decode_helper(ctx, batch, ctx_params.n_batch)) { LOG_ERR("%s: llama_decode() failed\n", __func__); @@ -143,7 +141,7 @@ int main(int argc, char ** argv) { if (is_pp_shared) { for (int32_t i = 1; i < pl; ++i) { - llama_kv_cache_seq_cp(kv, 0, i, -1, -1); + llama_kv_self_seq_cp(ctx, 0, i, -1, -1); } } diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 371917b2e..a6494ebdf 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -111,7 +111,7 @@ if llama_decode(context, batch) != 0 { } for i in 1 ..< n_parallel { - llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens) + llama_kv_self_seq_cp(context, 0, Int32(i), 0, batch.n_tokens) } if n_parallel > 1 { diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index adb4a60ad..3733e32d7 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -342,8 +342,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { } static bool get_hidden_layers(llama_context * ctx, std::vector & tokens) { - llama_kv_cache * kv = llama_get_kv_cache(ctx); - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { fprintf(stderr, "%s : failed to eval\n", __func__); return false; diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index fda0949f1..c4fb1c6d1 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ 
-34,11 +34,10 @@ static void batch_add_seq(llama_batch & batch, const std::vector & toke static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) { const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); - const llama_model * model = llama_get_model(ctx); - llama_kv_cache * kv = llama_get_kv_cache(ctx); + const struct llama_model * model = llama_get_model(ctx); // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 16437453e..f7db7861c 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -13,8 +13,6 @@ static std::vector> encode(llama_context * ctx, const std::ve const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1); for (uint64_t i = 0; i < sentences.size(); i++) { @@ -47,7 +45,7 @@ static std::vector> encode(llama_context * ctx, const std::ve } // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); llama_set_embeddings(ctx, true); llama_set_causal_attn(ctx, false); @@ -102,11 +100,9 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - llama_token eos_token = llama_vocab_eos(vocab); - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); llama_set_embeddings(ctx, false); llama_set_causal_attn(ctx, true); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 5efe4f019..e335ecc74 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -431,8 +431,6 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - const bool add_bos = llama_vocab_get_add_bos(vocab); const int n_ctx = llama_n_ctx(ctx); @@ -499,7 +497,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); llama_batch batch = llama_batch_init(n_batch, 0, 1); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index de8e77695..4e2f7b727 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -139,8 +139,6 @@ int main(int argc, char ** argv) { return 1; } - llama_kv_cache * kv = llama_get_kv_cache(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); const int n_ctx_train = llama_model_n_ctx_train(model); @@ -334,8 +332,8 @@ int main(int argc, char ** argv) { LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (kv, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); - llama_kv_cache_seq_add(kv, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 
1); + llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); n_past -= n_discard; diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 8843c0048..fc58135fe 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1575,11 +1575,9 @@ int main(int argc, char ** argv) { return 1; } - llama_kv_cache * kv = llama_get_kv_cache(ctx); - test t(inst, lmodel, ctx); - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); // cool off before the test if (params.delay) { @@ -1619,7 +1617,7 @@ int main(int argc, char ** argv) { } for (int i = 0; i < params.reps; i++) { - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); uint64_t t_start = get_time_ns(); diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 2a73983a9..cf5e14907 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -194,7 +194,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( } batch->logits[batch->n_tokens - 1] = true; - llama_kv_cache_clear(context); + llama_kv_self_clear(context); const auto t_pp_start = ggml_time_us(); if (llama_decode(context, *batch) != 0) { @@ -206,7 +206,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( LOGi("Benchmark text generation (tg)"); - llama_kv_cache_clear(context); + llama_kv_self_clear(context); const auto t_tg_start = ggml_time_us(); for (i = 0; i < tg; i++) { @@ -223,7 +223,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( const auto t_tg_end = ggml_time_us(); - llama_kv_cache_clear(context); + llama_kv_self_clear(context); const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0; const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0; @@ -448,5 +448,5 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop( extern "C" JNIEXPORT void JNICALL Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) { - llama_kv_cache_clear(reinterpret_cast(context)); + llama_kv_self_clear(reinterpret_cast(context)); } diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 477c3e6f2..82c26935b 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -208,7 +208,7 @@ actor LlamaContext { } batch.logits[Int(batch.n_tokens) - 1] = 1 // true - llama_kv_cache_clear(context) + llama_kv_self_clear(context) let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000; @@ -221,7 +221,7 @@ actor LlamaContext { // bench text generation - llama_kv_cache_clear(context) + llama_kv_self_clear(context) let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000; @@ -240,7 +240,7 @@ actor LlamaContext { let t_tg_end = DispatchTime.now().uptimeNanoseconds / 1000; - llama_kv_cache_clear(context) + llama_kv_self_clear(context) let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0 let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0 @@ -290,7 +290,7 @@ actor LlamaContext { func clear() { tokens_list.removeAll() temporary_invalid_cchars.removeAll() - llama_kv_cache_clear(context) + llama_kv_self_clear(context) } private func tokenize(text: String, add_bos: Bool) -> [llama_token] { diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index 1219c2074..b7f334007 100644 --- a/examples/lookahead/lookahead.cpp +++ 
b/examples/lookahead/lookahead.cpp @@ -60,7 +60,6 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); - llama_kv_cache * kv = llama_get_kv_cache(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -96,7 +95,7 @@ int main(int argc, char ** argv) { llama_decode(ctx, llama_batch_get_one(&inp.back(), 1)); for (int s = 1; s < W + G + 1; ++s) { - llama_kv_cache_seq_cp(kv, 0, s, -1, -1); + llama_kv_self_seq_cp(ctx, 0, s, -1, -1); } const auto t_enc_end = ggml_time_us(); @@ -438,17 +437,17 @@ int main(int argc, char ** argv) { // KV cache management // if no verification token matched, we simply remove all cells from this batch -> no fragmentation - llama_kv_cache_seq_rm(kv, -1, n_past, -1); + llama_kv_self_seq_rm(ctx, -1, n_past, -1); if (seq_id_best != 0) { // if a verification token matched, we keep the best sequence and remove the rest // this leads to some KV cache fragmentation - llama_kv_cache_seq_keep(kv, seq_id_best); - llama_kv_cache_seq_cp (kv, seq_id_best, 0, -1, -1); - llama_kv_cache_seq_rm (kv, seq_id_best, -1, -1); + llama_kv_self_seq_keep(ctx, seq_id_best); + llama_kv_self_seq_cp (ctx, seq_id_best, 0, -1, -1); + llama_kv_self_seq_rm (ctx, seq_id_best, -1, -1); for (int s = 1; s < W + G + 1; ++s) { - llama_kv_cache_seq_cp(kv, 0, s, -1, -1); + llama_kv_self_seq_cp(ctx, 0, s, -1, -1); } } } diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index 8628f7318..4ae93b2a5 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -35,7 +35,6 @@ int main(int argc, char ** argv){ llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); - llama_kv_cache * kv = llama_get_kv_cache(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -193,7 +192,7 @@ int main(int argc, char ** argv){ // KV cache management // clean the cache of draft tokens that weren't accepted - llama_kv_cache_seq_rm(kv, 0, n_past, -1); + llama_kv_self_seq_rm(ctx, 0, n_past, -1); common_batch_clear(batch_tgt); common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 9d79af79e..23437937c 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -157,8 +157,6 @@ int main(int argc, char ** argv) { return 1; } - llama_kv_cache * kv = llama_get_kv_cache(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); auto chat_templates = common_chat_templates_from_model(model, params.chat_template); @@ -330,7 +328,7 @@ int main(int argc, char ** argv) { } // remove any "future" tokens that we might have inherited from the previous session - llama_kv_cache_seq_rm(kv, -1, n_matching_session_tokens, -1); + llama_kv_self_seq_rm(ctx, -1, n_matching_session_tokens, -1); } LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n", @@ -571,8 +569,8 @@ int main(int argc, char ** argv) { LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (kv, 0, params.n_keep , params.n_keep + n_discard); - llama_kv_cache_seq_add(kv, 0, params.n_keep + n_discard, n_past, -n_discard); + llama_kv_self_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); + llama_kv_self_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); n_past -= n_discard; @@ -595,9 +593,9 @@ int 
main(int argc, char ** argv) { LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); - llama_kv_cache_seq_add(kv, 0, ga_i, n_past, ib*bd); - llama_kv_cache_seq_div(kv, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); - llama_kv_cache_seq_add(kv, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); + llama_kv_self_seq_add(ctx, 0, ga_i, n_past, ib*bd); + llama_kv_self_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); + llama_kv_self_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); n_past -= bd; diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 2ba0706dc..3f9e1bcbb 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -134,7 +134,6 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model.get(); llama_context * ctx = llama_init.context.get(); - llama_kv_cache * kv = llama_get_kv_cache(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -202,7 +201,7 @@ int main(int argc, char ** argv) { // assign the system KV cache to all parallel sequences for (int32_t i = 1; i <= n_clients; ++i) { - llama_kv_cache_seq_cp(kv, 0, i, -1, -1); + llama_kv_self_seq_cp(ctx, 0, i, -1, -1); } LOG_INF("\n"); @@ -234,9 +233,9 @@ int main(int argc, char ** argv) { if (batch.n_tokens == 0) { // all sequences have ended - clear the entire KV cache for (int i = 1; i <= n_clients; ++i) { - llama_kv_cache_seq_rm(kv, i, -1, -1); + llama_kv_self_seq_rm(ctx, i, -1, -1); // but keep the system prompt - llama_kv_cache_seq_cp(kv, 0, i, -1, -1); + llama_kv_self_seq_cp(ctx, 0, i, -1, -1); } LOG_INF("%s: clearing the KV cache\n", __func__); @@ -372,8 +371,8 @@ int main(int argc, char ** argv) { } // delete only the generated part of the sequence, i.e. 
keep the system prompt in the cache - llama_kv_cache_seq_rm(kv, client.id + 1, -1, -1); - llama_kv_cache_seq_cp(kv, 0, client.id + 1, -1, -1); + llama_kv_self_seq_rm(ctx, client.id + 1, -1, -1); + llama_kv_self_seq_cp(ctx, 0, client.id + 1, -1, -1); const auto t_main_end = ggml_time_us(); diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index e2764313b..46de2c2a2 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -86,8 +86,6 @@ int main(int argc, char ** argv) { return 1; } - llama_kv_cache * kv = llama_get_kv_cache(ctx); - auto sparams = llama_sampler_chain_default_params(); llama_sampler * smpl = llama_sampler_chain_init(sparams); @@ -134,11 +132,11 @@ int main(int argc, char ** argv) { const int ib = i/n_batch - 1; const int bd = n_batch_grp*(n_grp - 1); - llama_kv_cache_seq_add(kv, 0, n_past - n_batch, n_past, ib*bd); - llama_kv_cache_seq_div(kv, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); - llama_update_kv_cache (ctx, kv); + llama_kv_self_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd); + llama_kv_self_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); + llama_kv_self_update (ctx); - n_past = llama_kv_cache_seq_pos_max(kv, 0) + 1; + n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1; } common_batch_clear(batch); @@ -168,12 +166,12 @@ int main(int argc, char ** argv) { LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard); - llama_kv_cache_seq_rm (kv, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(kv, 0, n_keep + n_discard, n_ctx, -n_discard); - //llama_kv_cache_defrag (kv); - llama_update_kv_cache (ctx, kv); + llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard); + llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + //llama_kv_self_defrag (ctx); + llama_kv_self_update (ctx); - n_past = llama_kv_cache_seq_pos_max(kv, 0) + 1; + n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1; common_batch_clear(batch); @@ -199,12 +197,12 @@ int main(int argc, char ** argv) { if (n_discard > 0) { LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); - llama_kv_cache_seq_rm (kv, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(kv, 0, n_keep + n_discard, n_ctx, -n_discard); - //llama_kv_cache_defrag (kv); - llama_update_kv_cache (ctx, kv); + llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard); + llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + //llama_kv_self_defrag (ctx); + llama_kv_self_update (ctx); - n_past = llama_kv_cache_seq_pos_max(kv, 0) + 1; + n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1; } } diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 6c9f716ed..31c436f13 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -299,8 +299,6 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - const bool add_bos = llama_vocab_get_add_bos(vocab); GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); @@ -362,7 +360,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); llama_batch batch = llama_batch_init(n_batch, 0, 1); @@ -452,8 +450,6 @@ static results_perplexity 
perplexity(llama_context * ctx, const common_params & const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - const bool add_bos = llama_vocab_get_add_bos(vocab); GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); @@ -550,7 +546,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; @@ -745,8 +741,6 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - // Calculates hellaswag score (acc_norm) from prompt // // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl @@ -929,7 +923,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { return; } - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1090,8 +1084,6 @@ static void winogrande_score(llama_context * ctx, const common_params & params) const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - constexpr int k_min_trailing_ctx = 3; auto data = load_winogrande_from_csv(params.prompt); @@ -1210,7 +1202,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params) return; } - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1396,8 +1388,6 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - std::istringstream strstream(params.prompt); uint32_t n_task; strstream.read((char *)&n_task, sizeof(n_task)); @@ -1584,7 +1574,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par return; } - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1681,8 +1671,6 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); - llama_kv_cache * kv = llama_get_kv_cache(ctx); - if (params.logits_file.empty()) { LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__); return; @@ -1776,7 +1764,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { } // clear the KV cache - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); llama_batch batch = llama_batch_init(n_batch, 0, 1); diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index a907ea076..0efe20d4b 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -82,10 +82,8 @@ static void batch_add_seq(llama_batch & batch, const std::vector & toke } static 
void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) { - llama_kv_cache * kv = llama_get_kv_cache(ctx); - // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 8e2c174a9..2c38d1ef6 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -756,8 +756,7 @@ static int apply_chat_template(const common_chat_template & tmpl, LlamaData & ll // Function to tokenize the prompt static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt, std::vector & prompt_tokens, const LlamaData & llama_data) { - const llama_kv_cache * kv = llama_get_kv_cache(llama_data.context.get()); - const bool is_first = llama_kv_cache_used_cells(kv) == 0; + const bool is_first = llama_kv_self_used_cells(llama_data.context.get()) == 0; const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); prompt_tokens.resize(n_prompt_tokens); @@ -772,10 +771,8 @@ static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt // Check if we have enough space in the context to evaluate this batch static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) { - llama_kv_cache * kv = llama_get_kv_cache(ctx.get()); - const int n_ctx = llama_n_ctx(ctx.get()); - const int n_ctx_used = llama_kv_cache_used_cells(kv); + const int n_ctx_used = llama_kv_self_used_cells(ctx.get()); if (n_ctx_used + batch.n_tokens > n_ctx) { printf("\033[0m\n"); printe("context size exceeded\n"); diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 3839fbe8c..77b1572a9 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -156,8 +156,6 @@ int main(int argc, char ** argv) { // make new context llama_context * ctx3 = llama_init_from_model(model, common_context_params_to_llama(params)); - llama_kv_cache * kv3 = llama_get_kv_cache(ctx3); - llama_sampler * smpl3 = llama_sampler_chain_init(sparams); llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sampling.seed)); @@ -198,7 +196,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy); // erase whole kv - llama_kv_cache_clear(kv3); + llama_kv_self_clear(ctx3); fprintf(stderr, "%s : kv cache cleared\n", __func__); // restore kv into seq 1 diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 076044d39..b665bde41 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1693,7 +1693,6 @@ struct server_context { llama_model * model = nullptr; llama_context * ctx = nullptr; - llama_kv_cache * kv = nullptr; const llama_vocab * vocab = nullptr; @@ -1756,8 +1755,6 @@ struct server_context { return false; } - kv = llama_get_kv_cache(ctx); - vocab = llama_model_get_vocab(model); n_ctx = llama_n_ctx(ctx); @@ -2026,7 +2023,7 @@ struct server_context { SRV_DBG("%s", "clearing KV cache\n"); // clear the entire KV cache - llama_kv_cache_clear(kv); + llama_kv_self_clear(ctx); clean_kv_cache = false; } @@ -2568,8 +2565,8 @@ struct server_context { res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); res->t_start = metrics.t_start; - res->kv_cache_tokens_count = llama_kv_cache_n_tokens(kv); - res->kv_cache_used_cells = 
llama_kv_cache_used_cells(kv); + res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx); + res->kv_cache_used_cells = llama_kv_self_used_cells(ctx); res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; res->t_prompt_processing_total = metrics.t_prompt_processing_total; @@ -2685,7 +2682,7 @@ struct server_context { // Erase token cache const size_t n_erased = slot->cache_tokens.size(); - llama_kv_cache_seq_rm(kv, slot->id, -1, -1); + llama_kv_self_seq_rm(ctx, slot->id, -1, -1); slot->cache_tokens.clear(); auto res = std::make_unique(); @@ -2753,8 +2750,8 @@ struct server_context { SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); - llama_kv_cache_seq_rm (kv, slot.id, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(kv, slot.id, n_keep + n_discard, slot.n_past, -n_discard); + llama_kv_self_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); + llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard); if (slot.params.cache_prompt) { for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { @@ -2941,8 +2938,8 @@ struct server_context { const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; - llama_kv_cache_seq_rm (kv, slot.id, head_p, head_c); - llama_kv_cache_seq_add(kv, slot.id, head_c, -1, kv_shift); + llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c); + llama_kv_self_seq_add(ctx, slot.id, head_c, -1, kv_shift); for (size_t i = 0; i < n_match; i++) { slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i]; @@ -2980,9 +2977,9 @@ struct server_context { } // keep only the common part - if (!llama_kv_cache_seq_rm(kv, slot.id, slot.n_past, -1)) { + if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) { // could not partially delete (likely using a non-Transformer model) - llama_kv_cache_seq_rm(kv, slot.id, -1, -1); + llama_kv_self_seq_rm(ctx, slot.id, -1, -1); // there is no common part left slot.n_past = 0; @@ -3222,7 +3219,7 @@ struct server_context { slot.cache_tokens.push_back(id); slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1); - llama_kv_cache_seq_rm(kv, slot.id, slot.n_past, -1); + llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1); for (size_t i = 0; i < ids.size(); ++i) { completion_token_output result; diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index 130e326b5..84f415973 100644 --- a/examples/simple-chat/simple-chat.cpp +++ b/examples/simple-chat/simple-chat.cpp @@ -88,8 +88,6 @@ int main(int argc, char ** argv) { return 1; } - const llama_kv_cache * kv = llama_get_kv_cache(ctx); - // initialize the sampler llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params()); llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1)); @@ -100,7 +98,7 @@ int main(int argc, char ** argv) { auto generate = [&](const std::string & prompt) { std::string response; - const bool is_first = llama_kv_cache_used_cells(kv) == 0; + const bool is_first = llama_kv_self_used_cells(ctx) == 0; // tokenize the prompt const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); @@ -115,7 +113,7 @@ int main(int argc, char ** argv) { while (true) { // check if we have enough space in the context to evaluate this batch int n_ctx = llama_n_ctx(ctx); - int n_ctx_used = llama_kv_cache_used_cells(kv); + int n_ctx_used = llama_kv_self_used_cells(ctx); if (n_ctx_used + batch.n_tokens > n_ctx) { printf("\033[0m\n"); 
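// Illustrative sketch, not part of this diff: a caller that prefers to keep
// generating instead of erroring out here could reclaim space with the same
// llama_kv_self_* calls the server's context shift uses above. n_keep and
// n_discard are made-up values, and sequence 0 is assumed to be the only
// sequence in use:

                const int n_keep    = 32;               // tokens to preserve at the start of the context
                const int n_discard = batch.n_tokens;   // number of cells to reclaim

                llama_kv_self_seq_rm (ctx, 0, n_keep, n_keep + n_discard);
                llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, -1, -n_discard);

                n_ctx_used = llama_kv_self_used_cells(ctx);   // re-check usage after the shift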
fprintf(stderr, "context size exceeded\n"); diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index 24bdc806d..a5d2bc9d0 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -45,8 +45,6 @@ int main(int argc, char ** argv) { model_tgt = llama_init_tgt.model.get(); ctx_tgt = llama_init_tgt.context.get(); - llama_kv_cache * kv = llama_get_kv_cache(ctx_tgt); - const llama_vocab * vocab = llama_model_get_vocab(model_tgt); // load the draft model @@ -219,7 +217,7 @@ int main(int argc, char ** argv) { { LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past); - llama_kv_cache_seq_rm(kv, 0, n_past, -1); + llama_kv_self_seq_rm(ctx_tgt, 0, n_past, -1); } if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) { diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index b4e5259b5..bfddc67e0 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -90,9 +90,6 @@ int main(int argc, char ** argv) { model_dft = llama_init_dft.model.get(); ctx_dft = llama_init_dft.context.get(); - llama_kv_cache * kv_tgt = llama_get_kv_cache(ctx_tgt); - llama_kv_cache * kv_dft = llama_get_kv_cache(ctx_dft); - const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt); const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft); @@ -423,14 +420,14 @@ int main(int argc, char ** argv) { { LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); - llama_kv_cache_seq_keep(kv_dft, s_keep); - llama_kv_cache_seq_cp (kv_dft, s_keep, 0, -1, -1); - llama_kv_cache_seq_keep(kv_dft, 0); + llama_kv_self_seq_keep(ctx_dft, s_keep); + llama_kv_self_seq_cp (ctx_dft, s_keep, 0, -1, -1); + llama_kv_self_seq_keep(ctx_dft, 0); - llama_kv_cache_seq_rm (kv_tgt, s_keep, n_past_tgt, -1); - llama_kv_cache_seq_keep(kv_tgt, s_keep); - llama_kv_cache_seq_cp (kv_tgt, s_keep, 0, -1, -1); - llama_kv_cache_seq_keep(kv_tgt, 0); + llama_kv_self_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1); + llama_kv_self_seq_keep(ctx_tgt, s_keep); + llama_kv_self_seq_cp (ctx_tgt, s_keep, 0, -1, -1); + llama_kv_self_seq_keep(ctx_tgt, 0); } for (int s = 0; s < n_seq_dft; ++s) { @@ -447,8 +444,8 @@ int main(int argc, char ** argv) { common_batch_clear(batch_dft); common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true); - llama_kv_cache_seq_rm(kv_dft, 0, n_past_dft, -1); - // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(kv_dft, batch_dft).c_str()); + llama_kv_self_seq_rm(ctx_dft, 0, n_past_dft, -1); + // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); llama_decode(ctx_dft, batch_dft); ++n_past_dft; @@ -506,8 +503,8 @@ int main(int argc, char ** argv) { if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) { LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur); - llama_kv_cache_seq_rm(kv_dft, n_seq_cur, -1, -1); - llama_kv_cache_seq_cp(kv_dft, s, n_seq_cur, -1, -1); + llama_kv_self_seq_rm(ctx_dft, n_seq_cur, -1, -1); + llama_kv_self_seq_cp(ctx_dft, s, n_seq_cur, -1, -1); // all previous tokens from this branch are now also part of the new branch for (int t = 0; t < batch_tgt.n_tokens; ++t) { @@ -588,9 +585,9 @@ int main(int argc, char ** argv) { // evaluate the target model on the drafted tokens { - llama_kv_cache_seq_keep(kv_tgt, 0); + llama_kv_self_seq_keep(ctx_tgt, 0); for (int s = 1; s < n_seq_dft; ++s) { - 
llama_kv_cache_seq_cp(kv_tgt, 0, s, -1, -1); + llama_kv_self_seq_cp(ctx_tgt, 0, s, -1, -1); } // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); diff --git a/include/llama.h b/include/llama.h index 08b8658ad..91300b1ae 100644 --- a/include/llama.h +++ b/include/llama.h @@ -469,7 +469,7 @@ extern "C" { DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead"); LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx); // TODO: remove const? - LLAMA_API struct llama_kv_cache * llama_get_kv_cache( struct llama_context * ctx); + LLAMA_API struct llama_kv_cache * llama_get_kv_self ( struct llama_context * ctx); LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); @@ -641,28 +641,28 @@ extern "C" { // Returns the number of tokens in the KV cache (slow, use only for debug) // If a KV cell has multiple sequences assigned to it, it will be counted multiple times - LLAMA_API int32_t llama_kv_cache_n_tokens(const struct llama_kv_cache * kv); + LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx); DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx), - "use llama_kv_cache_n_tokens instead"); + "use llama_kv_self_n_tokens instead"); // Returns the number of used KV cells (i.e. have at least one sequence assigned to them) - LLAMA_API int32_t llama_kv_cache_used_cells(const struct llama_kv_cache * kv); + LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx); DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx), - "use llama_kv_cache_used_cells instead"); + "use llama_kv_self_used_cells instead"); // Clear the KV cache - both cell info is erased and KV data is zeroed - LLAMA_API void llama_kv_cache_clear( - struct llama_kv_cache * kv); + LLAMA_API void llama_kv_self_clear( + struct llama_context * ctx); // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) // Returns false if a partial sequence cannot be removed. 
Removing a whole sequence never fails // seq_id < 0 : match any sequence // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API bool llama_kv_cache_seq_rm( - struct llama_kv_cache * kv, + LLAMA_API bool llama_kv_self_seq_rm( + struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1); @@ -671,26 +671,26 @@ extern "C" { // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API void llama_kv_cache_seq_cp( - struct llama_kv_cache * kv, + LLAMA_API void llama_kv_self_seq_cp( + struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1); // Removes all tokens that do not belong to the specified sequence - LLAMA_API void llama_kv_cache_seq_keep( - struct llama_kv_cache * kv, + LLAMA_API void llama_kv_self_seq_keep( + struct llama_context * ctx, llama_seq_id seq_id); // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) // If the KV cache is RoPEd, the KV data is updated accordingly: // - lazily on next llama_decode() - // - explicitly with llama_kv_cache_update() + // - explicitly with llama_kv_self_update() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API void llama_kv_cache_seq_add( - struct llama_kv_cache * kv, + LLAMA_API void llama_kv_self_seq_add( + struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, @@ -699,32 +699,87 @@ extern "C" { // Integer division of the positions by factor of `d > 1` // If the KV cache is RoPEd, the KV data is updated accordingly: // - lazily on next llama_decode() - // - explicitly with llama_kv_cache_update() + // - explicitly with llama_kv_self_update() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API void llama_kv_cache_seq_div( - struct llama_kv_cache * kv, + LLAMA_API void llama_kv_self_seq_div( + struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d); // Returns the largest position present in the KV cache for the specified sequence - LLAMA_API llama_pos llama_kv_cache_seq_pos_max( - struct llama_kv_cache * kv, + LLAMA_API llama_pos llama_kv_self_seq_pos_max( + struct llama_context * ctx, llama_seq_id seq_id); // Defragment the KV cache // This will be applied: // - lazily on next llama_decode() - // - explicitly with llama_kv_cache_update() - LLAMA_API void llama_kv_cache_defrag(struct llama_kv_cache * kv); + // - explicitly with llama_kv_self_update() + LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx); // Check if the context supports KV cache shifting - LLAMA_API bool llama_kv_cache_can_shift(const struct llama_kv_cache * kv); + LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx); // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) 
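// The declarations below finish the rename: the cache operations now take the
// llama_context directly, and the old llama_kv_cache_* names remain only as
// DEPRECATED forwarders. A minimal migration sketch (illustration only, not part
// of this diff; `ctx` is a valid llama_context * and `n_past` is an assumed
// caller-side position):

    // before (the pattern being removed in this patch): fetch the cache object first
    //   llama_kv_cache * kv = llama_get_kv_cache(ctx);
    //   llama_kv_cache_clear(kv);
    //   llama_kv_cache_seq_rm(kv, 0, n_past, -1);

    // after: operate on the context's own ("self") KV cache
    llama_kv_self_clear (ctx);
    llama_kv_self_seq_rm(ctx, 0, n_past, -1);   // drop everything past n_past in sequence 0
    llama_kv_self_seq_cp(ctx, 0, 1, -1, -1);    // mirror sequence 0 into sequence 1 over the full range
    llama_kv_self_update(ctx);                  // apply any pending K-shift/defragmentation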
- LLAMA_API void llama_update_kv_cache(struct llama_context * ctx, struct llama_kv_cache * kv); + LLAMA_API void llama_kv_self_update(struct llama_context * ctx); + + DEPRECATED(LLAMA_API void llama_kv_cache_clear( + struct llama_context * ctx), + "use llama_kv_self_clear instead"); + + DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm( + struct llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1), + "use llama_kv_self_seq_rm instead"); + + DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp( + struct llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1), + "use llama_kv_self_seq_cp instead"); + + DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep( + struct llama_context * ctx, + llama_seq_id seq_id), + "use llama_kv_self_seq_keep instead"); + + DEPRECATED(LLAMA_API void llama_kv_cache_seq_add( + struct llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta), + "use llama_kv_self_seq_add instead"); + + DEPRECATED(LLAMA_API void llama_kv_cache_seq_div( + struct llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d), + "use llama_kv_self_seq_div instead"); + + DEPRECATED(LLAMA_API llama_pos llama_kv_cache_seq_pos_max( + struct llama_context * ctx, + llama_seq_id seq_id), + "use llama_kv_self_seq_pos_max instead"); + + DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx), + "use llama_kv_self_defrag instead"); + + DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx), + "use llama_kv_self_can_shift instead"); + + DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx), + "use llama_kv_self_update instead"); + // // State / sessions diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 0e146652c..0004e214b 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -606,7 +606,7 @@ const llama_model * llama_get_model(const llama_context * ctx) { return &ctx->model; } -llama_kv_cache * llama_get_kv_cache(llama_context * ctx) { +llama_kv_cache * llama_get_kv_self(llama_context * ctx) { return &ctx->kv_self; } @@ -1147,14 +1147,14 @@ static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_da data_ctx.write_embeddings(ctx); llama_kv_cache::io io = { - /* .write =*/ [&](const void * src, size_t size) { + /* .write = */ [&](const void * src, size_t size) { data_ctx.write(src, size); }, - /* .write_tensor_data =*/ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { + /* .write_tensor_data = */ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { data_ctx.write_tensor_data(tensor, offset, size); }, - /* .read =*/ nullptr, - /* .read_to =*/ nullptr, + /* .read = */ nullptr, + /* .read_to = */ nullptr, }; ctx->kv_self.state_write(io, ctx->model.hparams); @@ -1195,12 +1195,12 @@ static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_da data_ctx.read_embeddings(ctx); llama_kv_cache::io io = { - /* .write =*/ nullptr, - /* .write_tensor_data =*/ nullptr, - /* .read =*/ [&](size_t size) { + /* .write = */ nullptr, + /* .write_tensor_data = */ nullptr, + /* .read = */ [&](size_t size) { return data_ctx.read(size); }, - /* .read_to =*/ [&](void * dst, size_t size) { + /* .read_to = */ [&](void * dst, size_t size) { data_ctx.read_to(dst, size); }, }; @@ -1302,14 +1302,14 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam llama_synchronize(ctx); llama_kv_cache::io io = { - /* 
.write =*/ [&](const void * src, size_t size) { + /* .write = */ [&](const void * src, size_t size) { data_ctx.write(src, size); }, - /* .write_tensor_data =*/ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { + /* .write_tensor_data = */ [&](const struct ggml_tensor * tensor, size_t offset, size_t size) { data_ctx.write_tensor_data(tensor, offset, size); }, - /* .read =*/ nullptr, - /* .read_to =*/ nullptr, + /* .read = */ nullptr, + /* .read_to = */ nullptr, }; ctx->kv_self.state_write(io, ctx->model.hparams, seq_id); @@ -1336,12 +1336,12 @@ static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llam llama_synchronize(ctx); llama_kv_cache::io io = { - /* .write =*/ nullptr, - /* .write_tensor_data =*/ nullptr, - /* .read =*/ [&](size_t size) { + /* .write = */ nullptr, + /* .write_tensor_data = */ nullptr, + /* .read = */ [&](size_t size) { return data_ctx.read(size); }, - /* .read_to =*/ [&](void * dst, size_t size) { + /* .read_to = */ [&](void * dst, size_t size) { data_ctx.read_to(dst, size); }, }; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 6886d24f0..d2b81a022 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -1072,7 +1072,17 @@ bool llama_kv_cache::state_read_data(const io & io, const llama_hparams & hparam return true; } -///////////// +// +// interface implementation +// + +int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv) { + return kv->n_tokens(); +} + +int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv) { + return kv->used; +} void llama_kv_cache_clear(llama_kv_cache * kv) { kv->clear(); @@ -1125,14 +1135,6 @@ void llama_kv_cache_defrag(llama_kv_cache * kv) { kv->defrag(); } -int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv) { - return kv->n_tokens(); -} - -int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv) { - return kv->used; -} - bool llama_kv_cache_can_shift(const llama_kv_cache * kv) { return kv->can_shift; } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 0384a2b7c..2e021d4ed 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -190,6 +190,48 @@ struct llama_kv_slot_restorer { } }; +// TODO: maybe become part of the public llama_kv_cache in the future +int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv); + +int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv); + +void llama_kv_cache_clear(llama_kv_cache * kv); + +bool llama_kv_cache_seq_rm( + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1); + +void llama_kv_cache_seq_cp( + llama_kv_cache * kv, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1); + +void llama_kv_cache_seq_keep(llama_kv_cache * kv, llama_seq_id seq_id); + +void llama_kv_cache_seq_add( + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta); + +void llama_kv_cache_seq_div( + llama_kv_cache * kv, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d); + +llama_pos llama_kv_cache_seq_pos_max(llama_kv_cache * kv, llama_seq_id seq_id); + +void llama_kv_cache_defrag(llama_kv_cache * kv); + +bool llama_kv_cache_can_shift(const llama_kv_cache * kv); + // // kv cache view // diff --git a/src/llama.cpp b/src/llama.cpp index 0227ba6b3..b8f404375 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8564,7 +8564,7 @@ static int llama_decode_impl( // non-causal masks do not use the KV cache if (hparams.causal_attn) { - llama_update_kv_cache(&lctx, &lctx.kv_self); // TODO: lctx->update_kv_cache() + 
llama_kv_self_update(&lctx); // TODO: lctx->kv_self_update() // if we have enough unused cells before the current head -> // better to start searching from the beginning of the cache, hoping to fill it @@ -9182,9 +9182,12 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) { //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0); } -static void llama_update_kv_cache_impl(llama_context & lctx, llama_kv_cache & kv) { +// TODO: move to llama_context +static void llama_kv_self_update_impl(llama_context & lctx) { bool need_reserve = false; + auto & kv = lctx.kv_self; + if (kv.has_shift) { if (!kv.can_shift) { GGML_ABORT("The current context does not support K-shift"); @@ -9856,17 +9859,151 @@ void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * // deprecated int32_t llama_get_kv_cache_token_count(const llama_context * ctx) { + return llama_kv_self_n_tokens(ctx); +} + +int32_t llama_kv_self_n_tokens(const llama_context * ctx) { return llama_kv_cache_n_tokens(&ctx->kv_self); } // deprecated int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) { + return llama_kv_self_used_cells(ctx); +} + +int32_t llama_kv_self_used_cells(const llama_context * ctx) { return llama_kv_cache_used_cells(&ctx->kv_self); } +// deprecated +void llama_kv_cache_clear(llama_context * ctx) { + llama_kv_self_clear(ctx); +} + +void llama_kv_self_clear(llama_context * ctx) { + llama_kv_cache_clear(&ctx->kv_self); +} + +// deprecated +bool llama_kv_cache_seq_rm( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + return llama_kv_self_seq_rm(ctx, seq_id, p0, p1); +} + +bool llama_kv_self_seq_rm( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + return llama_kv_cache_seq_rm(&ctx->kv_self, seq_id, p0, p1); +} + +// deprecated +void llama_kv_cache_seq_cp( + llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + return llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1); +} + +void llama_kv_self_seq_cp( + llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + return llama_kv_cache_seq_cp(&ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); +} + +// deprecated +void llama_kv_cache_seq_keep( + llama_context * ctx, + llama_seq_id seq_id) { + return llama_kv_self_seq_keep(ctx, seq_id); +} + +void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_cache_seq_keep(&ctx->kv_self, seq_id); +} + +// deprecated +void llama_kv_cache_seq_add( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + return llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta); +} + +void llama_kv_self_seq_add( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + return llama_kv_cache_seq_add(&ctx->kv_self, seq_id, p0, p1, delta); +} + +// deprecated +void llama_kv_cache_seq_div( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + return llama_kv_self_seq_div(ctx, seq_id, p0, p1, d); +} + +void llama_kv_self_seq_div( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + return llama_kv_cache_seq_div(&ctx->kv_self, seq_id, p0, p1, d); +} + +// deprecated +llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_self_seq_pos_max(ctx, seq_id); +} + +llama_pos 
llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_cache_seq_pos_max(&ctx->kv_self, seq_id); +} + +// deprecated +void llama_kv_cache_defrag(llama_context * ctx) { + return llama_kv_self_defrag(ctx); +} + +void llama_kv_self_defrag(llama_context * ctx) { + return llama_kv_cache_defrag(&ctx->kv_self); +} + +// deprecated +bool llama_kv_cache_can_shift(const llama_context * ctx) { + return llama_kv_self_can_shift(ctx); +} + +bool llama_kv_self_can_shift(const llama_context * ctx) { + return llama_kv_cache_can_shift(&ctx->kv_self); +} + +// deprecated +void llama_kv_cache_update(llama_context * ctx) { + llama_kv_self_update(ctx); +} + // TODO: move to llama-context -void llama_update_kv_cache(llama_context * ctx, llama_kv_cache * kv) { - llama_update_kv_cache_impl(*ctx, *kv); +void llama_kv_self_update(llama_context * ctx) { + llama_kv_self_update_impl(*ctx); } /// From a19f671fe078497f73ec1898951475e026ffdc20 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 15 Jan 2025 10:54:21 +0200 Subject: [PATCH 09/28] context : minor ggml-ci --- src/llama-context.cpp | 36 +++++++++++------------------------- src/llama-context.h | 8 +++----- src/llama-kv-cache.cpp | 1 + src/llama-kv-cache.h | 6 +++--- src/llama.cpp | 33 +++++++++++++++++++-------------- 5 files changed, 37 insertions(+), 47 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 0004e214b..9eae6fe57 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -8,30 +8,6 @@ #include #include -void llama_set_k_shift(struct llama_context & lctx) { - const int64_t kv_size = lctx.kv_self.size; - - assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer)); - - int32_t * data = (int32_t *) lctx.inp_K_shift->data; - - for (int i = 0; i < kv_size; ++i) { - data[i] = lctx.kv_self.cells[i].delta; - } -} - -void llama_set_s_copy(struct llama_context & lctx) { - const int64_t kv_size = lctx.kv_self.size; - - assert(ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer)); - - int32_t * data = (int32_t *) lctx.inp_s_copy->data; - - for (int i = 0; i < kv_size; ++i) { - data[i] = lctx.kv_self.cells[i].src; - } -} - // llama input static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { @@ -58,6 +34,16 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } +void llama_context::set_k_shift(llama_kv_cache & kv) { + assert(ggml_backend_buffer_is_host(inp_K_shift->buffer)); + + int32_t * data = (int32_t *) inp_K_shift->data; + + for (uint32_t i = 0; i < kv.size; ++i) { + data[i] = kv.cells[i].delta; + } +} + void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { // // set input data @@ -134,7 +120,6 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - float * data = nullptr; float * data_swa = nullptr; @@ -599,6 +584,7 @@ uint32_t llama_n_ubatch(const struct llama_context * ctx) { } uint32_t llama_n_seq_max(const struct llama_context * ctx) { + // TODO: add notion of n_seq_max to llama_kv_cache and use it here return ctx->kv_self.size; } diff --git a/src/llama-context.h b/src/llama-context.h index a9268b292..73baa711f 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -18,7 +18,7 @@ struct llama_context { llama_context(const llama_model & model) : model(model) , t_start_us(model.t_start_us) - , t_load_us(model.t_load_us) {} + , 
t_load_us (model.t_load_us) {} const struct llama_model & model; @@ -107,13 +107,11 @@ struct llama_context { struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] + + void set_k_shift(llama_kv_cache & kv); }; // TODO: make these methods of llama_context -void llama_set_k_shift(struct llama_context & lctx); - -void llama_set_s_copy(struct llama_context & lctx); - void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch); // Make sure enough space is available for outputs. diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index d2b81a022..b79c2ff93 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -6,6 +6,7 @@ #include "llama-model.h" #include +#include #include #include #include diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 2e021d4ed..5ffee6281 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -88,11 +88,11 @@ struct llama_kv_cache { void clear(); - bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1); + bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1); void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1); void seq_keep(llama_seq_id seq_id); - void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta); - void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d); + void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta); + void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d); llama_pos seq_pos_max(llama_seq_id seq_id); diff --git a/src/llama.cpp b/src/llama.cpp index b8f404375..3e1cd8260 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1142,18 +1142,18 @@ struct llm_build_context { ctx0 = ggml_init(params); - lctx.inp_tokens = nullptr; - lctx.inp_embd = nullptr; - lctx.inp_pos = nullptr; - lctx.inp_out_ids = nullptr; - lctx.inp_KQ_mask = nullptr; - lctx.inp_KQ_mask_swa = nullptr; - lctx.inp_K_shift = nullptr; - lctx.inp_mean = nullptr; - lctx.inp_cls = nullptr; - lctx.inp_s_copy = nullptr; - lctx.inp_s_mask = nullptr; - lctx.inp_s_seq = nullptr; + lctx.inp_tokens = nullptr; + lctx.inp_embd = nullptr; + lctx.inp_pos = nullptr; + lctx.inp_out_ids = nullptr; + lctx.inp_KQ_mask = nullptr; + lctx.inp_KQ_mask_swa = nullptr; + lctx.inp_K_shift = nullptr; + lctx.inp_mean = nullptr; + lctx.inp_cls = nullptr; + lctx.inp_s_copy = nullptr; + lctx.inp_s_mask = nullptr; + lctx.inp_s_seq = nullptr; lctx.inp_pos_bucket = nullptr; lctx.inp_embd_enc = nullptr; lctx.inp_KQ_mask_cross = nullptr; @@ -1174,9 +1174,11 @@ struct llm_build_context { ggml_set_input(lctx.inp_K_shift); for (int il = 0; il < n_layer; ++il) { - const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_head_kv = hparams.n_head_kv(il); const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * k = ggml_view_3d(ctx0, kv_self.k_l[il], n_embd_head_k, n_head_kv, n_ctx, @@ -1189,6 +1191,7 @@ struct llm_build_context { // dequantize to f32 -> RoPE -> quantize back tmp = ggml_cast(ctx0, k, GGML_TYPE_F32); cb(tmp, "K_f32", il); + for (auto & backend : lctx.backends) { // Figure out which backend KV cache belongs to if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) { @@ -1200,6 +1203,7 @@ struct llm_build_context { lctx.inp_K_shift, rope_factors, 
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(tmp, "K_shifted_f32", il); + tmp = ggml_cpy(ctx0, tmp, k); } else { // we rotate only the first n_rot dimensions @@ -1208,6 +1212,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); } cb(tmp, "K_shifted", il); + ggml_build_forward_expand(gf, tmp); } @@ -9201,7 +9206,7 @@ static void llama_kv_self_update_impl(llama_context & lctx) { ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - llama_set_k_shift(lctx); + lctx.set_k_shift(kv); llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); From ae274f9747cce6ba6b4910d05ddc3016cd0b4e21 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 15 Jan 2025 13:35:56 +0200 Subject: [PATCH 10/28] llama : fix names [no ci] --- src/llama.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 3e1cd8260..37816ddc2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1373,9 +1373,9 @@ struct llm_build_context { inp = ggml_graph_node(gf, i); if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) { break; - } else { - inp = nullptr; } + + inp = nullptr; } GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor"); @@ -1431,7 +1431,7 @@ struct llm_build_context { return gf; } - struct ggml_tensor * llm_build_pos_bucket(bool causal) { + struct ggml_tensor * build_pos_bucket(bool causal) { if (causal) { lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); } else { @@ -1444,7 +1444,7 @@ struct llm_build_context { return lctx.inp_pos_bucket; } - struct ggml_tensor * llm_build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { + struct ggml_tensor * build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0); cb(pos_bucket_1d, "pos_bucket_1d", -1); @@ -1463,7 +1463,7 @@ struct llm_build_context { return pos_bias; } - struct ggml_tensor * llm_build_inp_embd_enc() { + struct ggml_tensor * build_inp_embd_enc() { const int64_t n_embd = hparams.n_embd; lctx.inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc); ggml_set_input(lctx.inp_embd_enc); @@ -1471,7 +1471,7 @@ struct llm_build_context { return lctx.inp_embd_enc; } - struct ggml_tensor * llm_build_inp_KQ_mask_cross() { + struct ggml_tensor * build_inp_KQ_mask_cross() { lctx.inp_KQ_mask_cross = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); ggml_set_input(lctx.inp_KQ_mask_cross); cb(lctx.inp_KQ_mask_cross, "KQ_mask_cross", -1); @@ -6775,7 +6775,7 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); GGML_ASSERT(lctx.is_encoding); - struct ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false); + struct ggml_tensor * pos_bucket_enc = build_pos_bucket(false); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); @@ -6810,7 +6810,7 @@ struct llm_build_context { cb(kq, "kq", il); struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? 
model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; - struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b); + struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_enc, attn_rel_b); struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); cb(kq_b, "kq_b", il); @@ -6909,11 +6909,11 @@ struct llm_build_context { GGML_ASSERT(!lctx.is_encoding); GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); - struct ggml_tensor * embd_enc = llm_build_inp_embd_enc(); - struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true); + struct ggml_tensor * embd_enc = build_inp_embd_enc(); + struct ggml_tensor * pos_bucket_dec = build_pos_bucket(true); struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); - struct ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross(); + struct ggml_tensor * KQ_mask_cross = build_inp_KQ_mask_cross(); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6961,7 +6961,7 @@ struct llm_build_context { cb(kq, "kq", il); struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; - struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b); + struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_dec, attn_rel_b); struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); cb(kq_b, "kq_b", il); From f2524c0e4137a4327473c086f97a01aa0632ca3e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 16 Jan 2025 15:04:14 +0200 Subject: [PATCH 11/28] llama : remove references to llama_kv_cache (wip) Intermediate step necessary to abstract the `llama_context` and `llama_kv_cache`. ggml-ci --- src/llama-context.cpp | 1035 +++++++++- src/llama-context.h | 162 +- src/llama.cpp | 4568 +++++++++++++++++------------------------ 3 files changed, 2982 insertions(+), 2783 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 9eae6fe57..910e2243d 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -8,8 +8,6 @@ #include #include -// llama input - static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { // TODO move to hparams if a T5 variant appears that uses a different value const int64_t max_distance = 128; @@ -34,56 +32,88 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } -void llama_context::set_k_shift(llama_kv_cache & kv) { - assert(ggml_backend_buffer_is_host(inp_K_shift->buffer)); - - int32_t * data = (int32_t *) inp_K_shift->data; - - for (uint32_t i = 0; i < kv.size; ++i) { - data[i] = kv.cells[i].delta; - } +// TODO: improve +void llama_context::reset() { + inp_tokens = nullptr; + inp_embd = nullptr; + inp_pos = nullptr; + inp_out_ids = nullptr; + inp_mean = nullptr; + inp_cls = nullptr; + inp_embd_enc = nullptr; + inp_pos_bucket = nullptr; + inp_KQ_mask = nullptr; + inp_KQ_mask_cnv = nullptr; + inp_KQ_mask_swa = nullptr; + inp_KQ_mask_swa_cnv = nullptr; + inp_KQ_mask_cross = nullptr; + inp_K_shift = nullptr; + inp_s_copy = nullptr; + inp_s_mask = nullptr; } -void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { +void llama_context::prepare_k_shift() { +} + +void llama_context::prepare_defrag() { +} + +void llama_context::prepare_decode(const llama_ubatch & /*ubatch*/) { +} + +// llama input + +void llama_context::set_inputs(const llama_ubatch & ubatch) { + const llama_hparams & hparams = model.hparams; + // // set input data // - const auto & 
hparams = lctx.model.hparams; - const auto & cparams = lctx.cparams; - const auto & kv_self = lctx.kv_self; + if (inp_K_shift) { + assert(ggml_backend_buffer_is_host(inp_K_shift->buffer)); + + int32_t * data = (int32_t *) inp_K_shift->data; + + for (uint32_t i = 0; i < kv_self.size; ++i) { + data[i] = kv_self.cells[i].delta; + } + + // the K-shift graph requires just this input + return; + } if (ubatch.token) { const int64_t n_tokens = ubatch.n_tokens; - ggml_backend_tensor_set(lctx.inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens)); + ggml_backend_tensor_set(inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(inp_tokens)); } if (ubatch.embd) { const int64_t n_embd = hparams.n_embd; const int64_t n_tokens = ubatch.n_tokens; - ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd)); + ggml_backend_tensor_set(inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(inp_embd)); } - if (ubatch.pos && lctx.inp_pos) { + if (ubatch.pos && inp_pos) { const int64_t n_tokens = ubatch.n_tokens; - auto n_pos = lctx.n_pos_per_token; - ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(lctx.inp_pos)); + auto n_pos = n_pos_per_token; + ggml_backend_tensor_set(inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(inp_pos)); } if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - //GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs"); + //GGML_ASSERT(inp_out_ids && "every model that can must skip unused outputs"); - if (!lctx.inp_out_ids) { - LLAMA_LOG_WARN("%s: 'lctx.inp_out_ids' is not created\n", __func__); + if (!inp_out_ids) { + LLAMA_LOG_WARN("%s: 'inp_out_ids' is not created\n", __func__); } else { const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer)); - int32_t * data = (int32_t *) lctx.inp_out_ids->data; + GGML_ASSERT(ggml_backend_buffer_is_host(inp_out_ids->buffer)); + int32_t * data = (int32_t *) inp_out_ids->data; - if (lctx.n_outputs == n_tokens) { + if (n_outputs == n_tokens) { for (int i = 0; i < n_tokens; ++i) { data[i] = i; } @@ -95,26 +125,26 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { } } // the graph needs to have been passed the correct number of outputs - GGML_ASSERT(lctx.n_outputs == n_outputs); - } else if (lctx.n_outputs == 1) { + GGML_ASSERT(n_outputs == n_outputs); + } else if (n_outputs == 1) { // only keep last output data[0] = n_tokens - 1; } else { - GGML_ASSERT(lctx.n_outputs == 0); + GGML_ASSERT(n_outputs == 0); } } } GGML_ASSERT( - // (!a || b) is a logical implication (a -> b) - // !hparams.causal_attn -> !cparams.causal_attn - (hparams.causal_attn || !cparams.causal_attn) && - "causal attention is not supported by this model" - ); + // (!a || b) is a logical implication (a -> b) + // !hparams.causal_attn -> !cparams.causal_attn + (hparams.causal_attn || !cparams.causal_attn) && + "causal attention is not supported by this model" + ); - if (lctx.inp_KQ_mask || lctx.inp_KQ_mask_swa) { + if (inp_KQ_mask || inp_KQ_mask_swa) { // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. 
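// Simplified restatement of the rule the causal branch below implements, for
// orientation only (the real loop also handles the ALiBi bias and the padded
// tail of the mask); i indexes KV cells, j indexes the query tokens of the
// current ubatch, and seq_id is the sequence of token j:

    const bool visible = kv_self.cells[i].has_seq_id(seq_id)    // cell belongs to the same sequence
                      && kv_self.cells[i].pos <= ubatch.pos[j]; // and is not in the future
    const float f = visible ? 0.0f : -INFINITY;                 // value written into inp_KQ_mask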
- if (cparams.causal_attn && !lctx.is_encoding) { + if (cparams.causal_attn && !is_encoding) { const int64_t n_kv = kv_self.n; const int64_t n_tokens = ubatch.n_tokens; const int64_t n_seq_tokens = ubatch.n_seq_tokens; @@ -123,14 +153,14 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { float * data = nullptr; float * data_swa = nullptr; - if (lctx.inp_KQ_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer)); - data = (float *) lctx.inp_KQ_mask->data; + if (inp_KQ_mask) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask->buffer)); + data = (float *) inp_KQ_mask->data; } - if (lctx.inp_KQ_mask_swa) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_swa->buffer)); - data_swa = (float *) lctx.inp_KQ_mask_swa->data; + if (inp_KQ_mask_swa) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask_swa->buffer)); + data_swa = (float *) inp_KQ_mask_swa->data; } // For causal attention, use only the previous KV cells @@ -191,11 +221,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; // when using kv cache, the mask needs to match the kv cache size - const int64_t n_stride = hparams.causal_attn && !lctx.is_encoding ? kv_self.n : n_tokens; + const int64_t n_stride = hparams.causal_attn && !is_encoding ? kv_self.n : n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask->buffer)); - float * data = (float *) lctx.inp_KQ_mask->data; + float * data = (float *) inp_KQ_mask->data; for (int h = 0; h < 1; ++h) { for (int s1 = 0; s1 < n_seqs; ++s1) { @@ -238,11 +268,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - GGML_ASSERT(lctx.inp_mean); - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); + GGML_ASSERT(inp_mean); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_mean->buffer)); - float * data = (float *) lctx.inp_mean->data; - memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean)); + float * data = (float *) inp_mean->data; + memset(inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(inp_mean)); std::vector sum(n_tokens, 0); @@ -279,11 +309,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - GGML_ASSERT(lctx.inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); + GGML_ASSERT(inp_cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); - uint32_t * data = (uint32_t *) lctx.inp_cls->data; - memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls)); + uint32_t * data = (uint32_t *) inp_cls->data; + memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); for (int s = 0; s < n_seqs; ++s) { const llama_seq_id seq_id = ubatch.seq_id[s][0]; @@ -306,11 +336,11 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { const int64_t n_seq_tokens = ubatch.n_seq_tokens; const int64_t n_seqs = ubatch.n_seqs; - GGML_ASSERT(lctx.inp_cls); - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); + GGML_ASSERT(inp_cls); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_cls->buffer)); - uint32_t * data = (uint32_t *) lctx.inp_cls->data; - memset(lctx.inp_cls->data, 0, n_tokens * 
ggml_element_size(lctx.inp_cls)); + uint32_t * data = (uint32_t *) inp_cls->data; + memset(inp_cls->data, 0, n_tokens * ggml_element_size(inp_cls)); std::vector last_pos(n_tokens, -1); std::vector last_row(n_tokens, -1); @@ -341,17 +371,18 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { if (kv_self.recurrent) { const int64_t n_kv = kv_self.n; - if (lctx.inp_s_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_mask->buffer)); - float * data = (float *) lctx.inp_s_mask->data; + if (inp_s_mask) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_s_mask->buffer)); + float * data = (float *) inp_s_mask->data; // clear unused states for (int i = 0; i < n_kv; ++i) { const uint32_t cell_id = i + kv_self.head; - llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id]; + llama_kv_cell & kv_cell = kv_self.cells[cell_id]; data[i] = (float) (kv_cell.src >= 0); + // TODO: do not mutate the KV cache // only clear once if (kv_cell.src < 0) { kv_cell.src = cell_id; @@ -359,14 +390,14 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { } } - if (lctx.inp_s_copy) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer)); - int32_t * data = (int32_t *) lctx.inp_s_copy->data; + if (inp_s_copy) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_s_copy->buffer)); + int32_t * data = (int32_t *) inp_s_copy->data; // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n for (uint32_t i = 0; i < n_kv; ++i) { const uint32_t cell_id = i + kv_self.head; - llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id]; + llama_kv_cell & kv_cell = kv_self.cells[cell_id]; // prevent out-of-bound sources if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self.size) { @@ -375,6 +406,7 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { data[i] = kv_cell.src; + // TODO: do not mutate the KV cache // ensure copy only happens once if (kv_cell.src != (int32_t) cell_id) { kv_cell.src = cell_id; @@ -383,20 +415,20 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { } } - if (lctx.inp_pos_bucket) { + if (inp_pos_bucket) { const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_pos_bucket->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_pos_bucket->buffer)); GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing - int32_t * data = (int32_t *) lctx.inp_pos_bucket->data; + int32_t * data = (int32_t *) inp_pos_bucket->data; - if (!lctx.is_encoding) { + if (!is_encoding) { const int64_t n_kv = kv_self.n; for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { for (int i = 0; i < n_kv; ++i) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(lctx.kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding); + data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, is_encoding); } } } @@ -404,28 +436,28 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { for (int i = 0; i < n_tokens; ++i) { - data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding); + data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, is_encoding); } } } } } - if 
(!lctx.is_encoding && lctx.inp_embd_enc) { - assert(lctx.inp_embd_enc->type == GGML_TYPE_F32); - assert((size_t) ggml_nelements(lctx.inp_embd_enc) == lctx.embd_enc.size()); + if (!is_encoding && inp_embd_enc) { + assert(inp_embd_enc->type == GGML_TYPE_F32); + assert((size_t) ggml_nelements(inp_embd_enc) == embd_enc.size()); - ggml_backend_tensor_set(lctx.inp_embd_enc, lctx.embd_enc.data(), 0, ggml_nbytes(lctx.inp_embd_enc)); + ggml_backend_tensor_set(inp_embd_enc, embd_enc.data(), 0, ggml_nbytes(inp_embd_enc)); } - if (!lctx.is_encoding && lctx.inp_KQ_mask_cross) { - const int64_t n_output_enc = lctx.embd_enc.size() / hparams.n_embd; + if (!is_encoding && inp_KQ_mask_cross) { + const int64_t n_output_enc = embd_enc.size() / hparams.n_embd; const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_cross->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(inp_KQ_mask_cross->buffer)); GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing - float * data = (float *) lctx.inp_KQ_mask_cross->data; + float * data = (float *) inp_KQ_mask_cross->data; for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { @@ -433,7 +465,7 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { float f = -INFINITY; for (int s = 0; s < ubatch.n_seq_id[j]; ++s) { const llama_seq_id seq_id = ubatch.seq_id[j][s]; - if (lctx.seq_ids_enc[i].find(seq_id) != lctx.seq_ids_enc[i].end()) { + if (seq_ids_enc[i].find(seq_id) != seq_ids_enc[i].end()) { f = 0.0f; } } @@ -450,6 +482,851 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) { } } +// do mat_mul, while optionally apply lora +ggml_tensor * llama_context::build_lora_mm( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur) { + struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); + + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } + + const float adapter_scale = lora.second; + const float scale = lw->get_scale(lora.first->alpha, adapter_scale); + + struct ggml_tensor * ab_cur = ggml_mul_mat( + ctx0, lw->b, + ggml_mul_mat(ctx0, lw->a, cur) + ); + + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + + return res; +} + +// do mat_mul_id, while optionally apply lora +ggml_tensor * llama_context::build_lora_mm_id( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur, + ggml_tensor * ids) { + struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } + + const float alpha = lora.first->alpha; + const float rank = (float) lw->b->ne[0]; + const float scale = alpha ? lora.second * alpha / rank : lora.second; + + struct ggml_tensor * ab_cur = ggml_mul_mat_id( + ctx0, lw->b, + ggml_mul_mat_id(ctx0, lw->a, cur, ids), + ids + ); + + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + + return res; +} + +void llama_context::build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto n_kv = worst_case ? kv_self.size : kv_self.n; + + inp_KQ_mask = causal + ? 
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) + : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + //cb(inp_KQ_mask, "KQ_mask", -1); + ggml_set_input(inp_KQ_mask); + + inp_KQ_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_KQ_mask, GGML_TYPE_F16) : inp_KQ_mask; + + if (swa) { + GGML_ASSERT(hparams.n_swa > 0); + + inp_KQ_mask_swa = causal + ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) + : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + //cb(inp_KQ_mask_swa, "KQ_mask_swa", -1); + ggml_set_input(inp_KQ_mask_swa); + + inp_KQ_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_KQ_mask_swa, GGML_TYPE_F16) : inp_KQ_mask_swa; + } +} + +void llama_context::build_attn_kv_store( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + int32_t n_tokens, + int64_t il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto & n_ctx = cparams.n_ctx; + + const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; + + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + GGML_ASSERT(kv_self.size == n_ctx); + + struct ggml_tensor * k_cache_view = ggml_view_1d(ctx0, kv_self.k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa)*kv_head); + //cb(k_cache_view, "k_cache_view", il); + + // note: storing RoPE-ed version of K in the KV cache + ggml_build_forward_expand(graph, ggml_cpy(ctx0, k_cur, k_cache_view)); + + assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens); + + struct ggml_tensor * v_cache_view = nullptr; + + if (cparams.flash_attn) { + v_cache_view = ggml_view_1d(ctx0, kv_self.v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa)*kv_head); + } else { + // note: the V cache is transposed when not using flash attention + v_cache_view = ggml_view_2d(ctx0, kv_self.v_l[il], n_tokens, n_embd_v_gqa, + ( n_ctx)*ggml_element_size(kv_self.v_l[il]), + (kv_head)*ggml_element_size(kv_self.v_l[il])); + + v_cur = ggml_transpose(ctx0, v_cur); + } + //cb(v_cache_view, "v_cache_view", il); + + ggml_build_forward_expand(graph, ggml_cpy(ctx0, v_cur, v_cache_view)); +} + +ggml_tensor * llama_context::build_attn_qkv( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + int32_t n_tokens, + float kq_scale, + int il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto & n_ctx = cparams.n_ctx; + const auto & n_embd_head_k = hparams.n_embd_head_k; + const auto & n_embd_head_v = hparams.n_embd_head_v; + + // TODO: improve + bool is_sliding = false; + + switch (model.arch) { + case LLM_ARCH_COHERE2: + { + const int32_t sliding_window_pattern = 4; + is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); + } break; + case LLM_ARCH_GEMMA2: + { + const int32_t sliding_window_pattern = 2; + is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); + } break; + case LLM_ARCH_PHI3: + { + is_sliding = hparams.n_swa > 0; + } break; + default: + { + is_sliding = false; + } + }; + + const auto & kq_mask = is_sliding ? inp_KQ_mask_swa_cnv : inp_KQ_mask_cnv; + + const auto n_kv = worst_case ? 
kv_self.size : kv_self.n; + + const int64_t n_head = hparams.n_head(il); + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + struct ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + //cb(q, "q", il); + + struct ggml_tensor * k = + ggml_view_3d(ctx0, kv_self.k_l[il], + n_embd_head_k, n_kv, n_head_kv, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + 0); + //cb(k, "k", il); + + struct ggml_tensor * cur; + + if (cparams.flash_attn) { + GGML_UNUSED(model); + GGML_UNUSED(n_ctx); + + // split cached v into n_head heads (not transposed) + struct ggml_tensor * v = + ggml_view_3d(ctx0, kv_self.v_l[il], + n_embd_head_v, n_kv, n_head_kv, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self.v_l[il]->type, n_embd_head_v), + 0); + //cb(v, "v", il); + + cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias, + hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f); + + ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); + + cur = ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens); + } else { + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + //cb(kq, "kq", il); + + // note: this op tends to require high floating point range + // while for some models F16 is enough, for others it is not, so we default to F32 here + ggml_mul_mat_set_prec(kq, GGML_PREC_F32); + + if (model.arch == LLM_ARCH_GROK) { + // need to do the following: + // multiply by attn_output_multiplyer of 0.08838834764831845 + // and then : + // kq = 30 * tanh(kq / 30) + // before the softmax below + + kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, 0.08838834764831845f/30.0f)); + kq = ggml_scale(ctx0, kq, 30); + } + + if (hparams.attn_soft_cap) { + kq = ggml_scale(ctx0, kq, 1.0f / hparams.f_attn_logit_softcapping); + kq = ggml_tanh(ctx0, kq); + kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping); + } + + kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); + //cb(kq, "kq_soft_max_ext", il); + + GGML_ASSERT(kv_self.size == n_ctx); + + // split cached v into n_head heads + struct ggml_tensor * v = + ggml_view_3d(ctx0, kv_self.v_l[il], + n_kv, n_embd_head_v, n_head_kv, + ggml_element_size(kv_self.v_l[il])*n_ctx, + ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, + 0); + //cb(v, "v", il); + + struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + //cb(kqv, "kqv", il); + + struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + //cb(kqv_merged, "kqv_merged", il); + + cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens); + //cb(cur, "kqv_merged_cont", il); + + if (!cparams.offload_kqv) { + // all nodes between the KV store and the attention output are run on the CPU + ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend_cpu); + } + } + + ggml_build_forward_expand(graph, cur); + + if (wo) { + cur = build_lora_mm(ctx0, wo, cur); + } + + if (wo_b) { + //cb(cur, "kqv_wo", il); + } + + if (wo_b) { + cur = ggml_add(ctx0, cur, wo_b); + } + + return cur; +} + +ggml_tensor * llama_context::build_soft_max_ext( + ggml_context * ctx0, + ggml_tensor * kq, + float kq_scale) { + const auto & hparams = model.hparams; + + return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias); +} + +ggml_tensor * llama_context::get_rope_factors(int il) { + const auto & hparams = model.hparams; + + // choose 
long/short freq factors based on the context size + const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max; + + if (model.layers[il].rope_freqs != nullptr) { + return model.layers[il].rope_freqs; + } + + if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) { + return model.layers[il].rope_long; + } + + return model.layers[il].rope_short; +} + +void llama_context::build_k_shift( + ggml_context * ctx0, + ggml_cgraph * graph) { + const auto & n_ctx = cparams.n_ctx; + const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; + const auto & freq_base = cparams.rope_freq_base; + const auto & freq_scale = cparams.rope_freq_scale; + + const auto & yarn_ext_factor = cparams.yarn_ext_factor; + const auto & yarn_attn_factor = cparams.yarn_attn_factor; + const auto & yarn_beta_fast = cparams.yarn_beta_fast; + const auto & yarn_beta_slow = cparams.yarn_beta_slow; + + const auto & hparams = model.hparams; + + const auto & n_rot = hparams.n_rot; + const auto & n_layer = hparams.n_layer; + const auto & rope_type = hparams.rope_type; + + const auto & n_embd_head_k = hparams.n_embd_head_k; + //const auto & n_embd_head_v = hparams.n_embd_head_v; + + GGML_ASSERT(kv_self.size == n_ctx); + + inp_K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); + //cb(inp_K_shift, "K_shift", -1); + ggml_set_input(inp_K_shift); + + for (uint32_t il = 0; il < n_layer; ++il) { + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + + struct ggml_tensor * rope_factors = get_rope_factors(il); + + struct ggml_tensor * k = + ggml_view_3d(ctx0, kv_self.k_l[il], + n_embd_head_k, n_head_kv, n_ctx, + ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + 0); + + struct ggml_tensor * tmp; + if (ggml_is_quantized(k->type)) { + // dequantize to f32 -> RoPE -> quantize back + tmp = ggml_cast(ctx0, k, GGML_TYPE_F32); + //cb(tmp, "K_f32", il); + + for (auto & backend : backends) { + // Figure out which backend KV cache belongs to + if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) { + ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get()); + break; + } + } + tmp = ggml_rope_ext_inplace(ctx0, tmp, + inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + //cb(tmp, "K_shifted_f32", il); + + tmp = ggml_cpy(ctx0, tmp, k); + } else { + // we rotate only the first n_rot dimensions + tmp = ggml_rope_ext_inplace(ctx0, k, + inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + } + //cb(tmp, "K_shifted", il); + + ggml_build_forward_expand(graph, tmp); + } +} + +void llama_context::build_defrag( + ggml_context * ctx0, + ggml_cgraph * graph) { + const auto & hparams = model.hparams; + + const uint32_t n_layer = hparams.n_layer; + + const uint32_t n_kv = kv_self.cell_max(); + const uint32_t n_used = kv_self.used; + + assert(n_used <= n_kv); + + //const int64_t t_start = ggml_time_us(); + + // number of cells moved + uint32_t n_moves = 0; + + // each move requires 6*n_layer tensors (see build_defrag) + // - source view, destination view, copy operation + // - x2 for keys and values + //const uint32_t max_moves = model.max_nodes()/(6*n_layer); + // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 + const uint32_t max_moves = (model.max_nodes() - 
2*n_layer)/(6*n_layer); + + // determine which KV cells to move where + // + // cell i moves to ids[i] + // + // if ids[i] == i || ids[i] == n_kv, then cell i is not moved + // + std::vector ids(n_kv, n_kv); + + for (uint32_t i0 = 0; i0 < n_used; ++i0) { + const auto & cell0 = kv_self.cells[i0]; + + if (!cell0.is_empty()) { + ids[i0] = i0; + + continue; + } + + // found a hole - fill it with data from the end of the cache + + uint32_t nh = 1; + + // determine the size of the hole + while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) { + nh++; + } + + uint32_t nf = 0; + uint32_t is = n_kv - 1; + + // starting from the end, find nh non-empty cells + for (; is > i0; --is) { + const auto & cell1 = kv_self.cells[is]; + + if (cell1.is_empty() || ids[is] != n_kv) { + continue; + } + + // non-empty cell which is not yet moved + nf++; + + if (nf == nh) { + break; + } + } + + // this can only happen if `n_used` is not accurate, which would be a bug + GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); + + nf = 0; + + uint32_t i1 = is; + + // are we moving a continuous block of memory? + bool cont = false; + + // should we stop searching for the next move? + bool stop = false; + + // go back and move the nf cells to the hole + for (; i1 < n_kv; ++i1) { + auto & cell1 = kv_self.cells[i1]; + + if (cell1.is_empty() || ids[i1] != n_kv) { + if (n_moves == max_moves) { + stop = true; + break; + } + + cont = false; + continue; + } + + // this cell goes to (i0 + nf) + ids[i1] = i0 + nf; + + // move the cell meta data + kv_self.cells[i0 + nf] = cell1; + + // clear the old cell and move the head there + cell1 = llama_kv_cell(); + kv_self.head = n_used; + + if (!cont) { + n_moves++; + cont = true; + } + + nf++; + + if (nf == nh) { + break; + } + } + + if (stop || n_moves == max_moves) { + break; + } + + //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); + + i0 += nh - 1; + } + + if (n_moves == 0) { + return; + } + + //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); + + //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer); + +#if 0 + // CPU defrag + // + // TODO: optimizations are possible: + // - multiple threads + // - avoid copying to the host memory when already there + // + // likely not worth the effort, as we have ggml_graph based defrag + // + + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + + const uint32_t kv_size = kv_self.size; + + std::vector buf_k; + std::vector buf_v; + + for (uint32_t il = 0; il < n_layer; ++il) { + const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size); + + const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); + const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size); + + buf_k.resize(k_size); + buf_v.resize(v_size); + + ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); + + // batch move [i, i+nm) to [id, id+nm) + // note: cells can move only to a lower index + for (uint32_t i = 0; i < n_kv; ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == n_kv) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < n_kv && ids[i + nm] == id + nm) { + nm++; + } + + // move keys + { + const int64_t os = i*k_size_row; + const int64_t od = id*k_size_row; + + memcpy(buf_k.data() + od, buf_k.data() + os, 
nm*k_size_row); + } + + // move values (note: they are transposed) + { + const int64_t os = i; + const int64_t od = id; + + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); + } + } + + i += nm - 1; + } + + ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); + } +#else + for (uint32_t i = 0; i < ids.size(); ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == ids.size()) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < ids.size() && ids[i + nm] == id + nm) { + nm++; + } + + for (uint32_t il = 0; il < n_layer; ++il) { + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i)); + + ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id)); + + ggml_tensor * view_v_src; + ggml_tensor * view_v_dst; + + if (cparams.flash_attn) { + // NOTE: the V cache is not transposed when using flash attention + view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], + n_embd_v_gqa, nm, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i)); + + view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], + n_embd_v_gqa, nm, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id)); + } else { + view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(kv_self.v_l[il]->type, kv_self.size), + ggml_row_size(kv_self.v_l[il]->type, i)); + + view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(kv_self.v_l[il]->type, kv_self.size), + ggml_row_size(kv_self.v_l[il]->type, id)); + } + + ggml_build_forward_expand(graph, ggml_cpy(ctx0, view_k_src, view_k_dst)); + ggml_build_forward_expand(graph, ggml_cpy(ctx0, view_v_src, view_v_dst)); + } + + i += nm - 1; + } + + //LLAMA_LOG_INFO("graph->n_nodes = %d\n", graph->n_nodes); +#endif +} + +ggml_tensor * llama_context::build_inp_s_copy( + ggml_context * ctx0, + bool worst_case) { + const auto n_kv = worst_case ? kv_self.size : kv_self.n; + + inp_s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); + //cb(inp_s_copy, "inp_s_copy", -1); + ggml_set_input(inp_s_copy); + return inp_s_copy; +} + +ggml_tensor * llama_context::build_inp_s_mask( + ggml_context * ctx0, + bool worst_case) { + const auto n_kv = worst_case ? kv_self.size : kv_self.n; + inp_s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); + //cb(inp_s_mask, "inp_s_mask", -1); + ggml_set_input(inp_s_mask); + return inp_s_mask; +} + +ggml_tensor * llama_context::build_copy_mask_state( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * s, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + int32_t n_tokens, + int32_t n_state, + int32_t n_seqs, + bool worst_case) { + const auto n_kv = worst_case ? kv_self.size : kv_self.n; + const auto kv_head = worst_case ? (kv_self.recurrent ? 
0 : kv_self.size - n_tokens) : kv_self.head; + + struct ggml_tensor * states = ggml_reshape_2d(ctx0, s, n_state, kv_self.size); + + // copy states + // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv + // this shrinks the tensor's ne[1] to n_kv + states = ggml_get_rows(ctx0, states, state_copy); + + // clear states of sequences which are starting at the beginning of this batch + // FIXME: zero-out NaNs? + states = ggml_mul(ctx0, states, state_mask); + + // copy states which won't be changed further (between n_seqs and n_kv) + ggml_build_forward_expand(graph, + ggml_cpy(ctx0, + ggml_view_1d(ctx0, states, n_state*(n_kv - n_seqs), (n_seqs )*n_state*ggml_element_size(states)), + ggml_view_1d(ctx0, s, n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*ggml_element_size(s)))); + + // the part of the states that will be used and modified + return ggml_view_2d(ctx0, states, n_state, n_seqs, states->nb[1], 0); +} + +// TODO: split +ggml_tensor * llama_context::build_mamba_layer( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto & n_tokens = ubatch.n_tokens; + + const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; + + const int64_t d_conv = hparams.ssm_d_conv; + const int64_t d_inner = hparams.ssm_d_inner; + const int64_t d_state = hparams.ssm_d_state; + const int64_t dt_rank = hparams.ssm_dt_rank; + const int64_t n_seqs = ubatch.n_seqs; + // Some variants of the Mamba arch (e.g. FalconMamba) apply layer norm on B and Dt layers + const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms; + // Use the same RMS norm as the final layer norm + const float norm_rms_eps = hparams.f_norm_rms_eps; + + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + + GGML_ASSERT(n_seqs != 0); + GGML_ASSERT(ubatch.equal_seqs); + GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); + + struct ggml_tensor * conv_states_all = kv_self.k_l[il]; + struct ggml_tensor * ssm_states_all = kv_self.v_l[il]; + + // (ab)using the KV cache to store the states + struct ggml_tensor * conv = build_copy_mask_state( + ctx0, graph, conv_states_all, state_copy, state_mask, + n_tokens, hparams.n_embd_k_s(), n_seqs, worst_case); + conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs); + struct ggml_tensor * ssm = build_copy_mask_state( + ctx0, graph, ssm_states_all, state_copy, state_mask, + n_tokens, hparams.n_embd_v_s(), n_seqs, worst_case); + ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs); + + // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); + + // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs} + struct ggml_tensor * xz = build_lora_mm(ctx0, model.layers[il].ssm_in, cur); + // split the above in two + // => {d_inner, n_seq_tokens, n_seqs} + struct ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0); + struct ggml_tensor * z = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*ggml_element_size(xz)); + + // conv + { + // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs} + struct ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0); + + // copy last (d_conv - 1) columns back into the state cache + struct ggml_tensor * last_conv =
ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); + + ggml_build_forward_expand(graph, + ggml_cpy(ctx0, last_conv, + ggml_view_1d(ctx0, conv_states_all, + (d_conv - 1)*(d_inner)*(n_seqs), + kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all)))); + + // 1D convolution + // The equivalent is to make a self-overlapping view of conv_x + // over d_conv columns at each stride in the 3rd dimension, + // then element-wise multiply that with the conv1d weight, + // then sum the elements of each row, + // (the last two steps are a dot product over rows (also doable with mul_mat)) + // then permute away the ne[0] dimension, + // and then you're left with the resulting x tensor. + // For simultaneous sequences, all sequences need to have the same length. + x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d); + + // bias + x = ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b); + + x = ggml_silu(ctx0, x); + } + + // ssm + { + // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs} + struct ggml_tensor * x_db = build_lora_mm(ctx0, model.layers[il].ssm_x, x); + // split + struct ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0); + struct ggml_tensor * B = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank); + struct ggml_tensor * C = ggml_view_3d(ctx0, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state)); + + // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers + if (ssm_dt_b_c_rms) { + dt = ggml_rms_norm(ctx0, dt, norm_rms_eps); + B = ggml_rms_norm(ctx0, B, norm_rms_eps); + C = ggml_rms_norm(ctx0, C, norm_rms_eps); + } + + // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs} + dt = build_lora_mm(ctx0, model.layers[il].ssm_dt, dt); + dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b); + + // Custom operator to optimize the parallel associative scan + // as described in the Annex D of the Mamba paper. 
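+        // roughly, ggml_ssm_scan fuses the discretization and the recurrent scan over each
+        // sequence, i.e. per token it evaluates (approximately)
+        //   h_t = exp(dt_t * A) * h_{t-1} + dt_t * B_t * x_t
+        //   y_t = C_t * h_t
+        // using the per-token dt, B and C computed above, rather than materializing the
+        // discretized matrices and stepping through the scan with separate ops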
+ // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs} + struct ggml_tensor * y_ssm = ggml_ssm_scan(ctx0, ssm, x, dt, model.layers[il].ssm_a, B, C); + + // store last states + ggml_build_forward_expand(graph, + ggml_cpy(ctx0, + ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]), + ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all)))); + + struct ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0); + + // TODO: skip computing output earlier for unused tokens + + // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs} + y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d)); + y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z))); + + // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs} + cur = build_lora_mm(ctx0, model.layers[il].ssm_out, y); + } + + // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs); + //cb(cur, "mamba_out", il); + + return cur; +} + + // llama output size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { diff --git a/src/llama-context.h b/src/llama-context.h index 73baa711f..a2f41b5c8 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -14,6 +14,8 @@ #include #include +using llama_loras = std::unordered_map; + struct llama_context { llama_context(const llama_model & model) : model(model) @@ -22,12 +24,10 @@ struct llama_context { const struct llama_model & model; - struct llama_cparams cparams; - struct llama_sbatch sbatch; // TODO: revisit if needed - struct llama_kv_cache kv_self; - struct llama_adapter_cvec cvec; - - std::unordered_map lora; + llama_cparams cparams; + llama_sbatch sbatch; // TODO: revisit if needed + llama_adapter_cvec cvec; + llama_loras loras; std::vector backends; std::vector> set_n_threads_fns; @@ -72,18 +72,6 @@ struct llama_context { // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE std::map> embd_seq; - // whether we are computing encoder output or decoder output - bool is_encoding = false; - - // TODO: find a better way to accommodate mutli-dimension position encoding methods - // number of position id each token get, 1 for each token in most cases. - // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate. 
- int n_pos_per_token = 1; - - // output of the encoder part of the encoder-decoder models - std::vector embd_enc; - std::vector> seq_ids_enc; - // memory buffers used to evaluate the model std::vector buf_compute_meta; ggml_backend_sched_ptr sched; @@ -91,29 +79,145 @@ struct llama_context { ggml_abort_callback abort_callback = nullptr; void * abort_callback_data = nullptr; + void reset(); + + void prepare_k_shift(); + void prepare_defrag(); + void prepare_decode(const llama_ubatch & ubatch); + + void set_inputs(const llama_ubatch & ubatch); + + ggml_tensor * build_lora_mm( + ggml_context * ctx0, + ggml_tensor * w, + ggml_tensor * cur); + + ggml_tensor * build_lora_mm_id( + ggml_context * ctx0, + ggml_tensor * w, // struct ggml_tensor * as + ggml_tensor * cur, // struct ggml_tensor * b + ggml_tensor * ids); + // input tensors struct ggml_tensor * inp_tokens; // I32 [n_batch] struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] struct ggml_tensor * inp_pos; // I32 [n_batch] struct ggml_tensor * inp_out_ids; // I32 [n_outputs] - struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] - struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch] - struct ggml_tensor * inp_K_shift; // I32 [kv_size] struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] struct ggml_tensor * inp_cls; // I32 [n_batch] + + // === encoder-decoder === + + // whether we are computing encoder output or decoder output + bool is_encoding = false; + + // output of the encoder part of the encoder-decoder models + std::vector embd_enc; + std::vector> seq_ids_enc; + + struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] + struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] + + // === unified KV cache === + + llama_kv_cache kv_self; + + struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] + struct ggml_tensor * inp_KQ_mask_cnv; // [kv_size, n_batch] + struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch] + struct ggml_tensor * inp_KQ_mask_swa_cnv; // [kv_size, n_batch] + struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] + struct ggml_tensor * inp_K_shift; // I32 [kv_size] + + void build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa, + bool worst_case); + + void build_attn_kv_store( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + int32_t n_tokens, + int64_t il, + bool worst_case); + + ggml_tensor * build_attn_qkv( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + int32_t n_tokens, + float kq_scale, + int il, + bool worst_case); + + ggml_tensor * build_soft_max_ext( + ggml_context * ctx0, + ggml_tensor * kq, + float kq_scale); + + ggml_tensor * get_rope_factors(int il); + + void build_k_shift( + ggml_context * ctx0, + ggml_cgraph * graph); + + // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache + void build_defrag( + ggml_context * ctx0, + ggml_cgraph * graph); + + // === recurrent === + + // TODO: add recurrent cache + // TODO: add mamba-specific llama_context + + // TODO: change these to build_mamba_inp and hide `state_copy` and `state_mask` inside the llama_context impl + ggml_tensor * build_inp_s_copy( + ggml_context * ctx0, + bool worst_case); + + ggml_tensor * build_inp_s_mask( + ggml_context * ctx0, + bool worst_case); + + ggml_tensor * build_copy_mask_state( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * s, + ggml_tensor * 
state_copy, + ggml_tensor * state_mask, + int32_t n_tokens, + int32_t n_state, + int32_t n_seqs, + bool worst_case); + + ggml_tensor * build_mamba_layer( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case); + struct ggml_tensor * inp_s_copy; // I32 [kv_size] struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] - struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch] - struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch] - struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc] - struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] - void set_k_shift(llama_kv_cache & kv); + // === vision === + + // TODO: find a better way to accommodate mutli-dimension position encoding methods + // number of position id each token get, 1 for each token in most cases. + // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate. + int n_pos_per_token = 1; }; -// TODO: make these methods of llama_context -void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch); - // Make sure enough space is available for outputs. // Returns max number of outputs for which space was reserved. size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs); diff --git a/src/llama.cpp b/src/llama.cpp index 37816ddc2..a2e5e0bea 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4,8 +4,6 @@ #include "llama-mmap.h" #include "llama-context.h" #include "llama-vocab.h" -#include "llama-sampling.h" -#include "llama-kv-cache.h" #include "llama-model-loader.h" #include "llama-model.h" @@ -106,946 +104,15 @@ enum llm_norm_type { LLM_NORM_GROUP, }; -static struct ggml_tensor * llm_build_inp_embd( - struct ggml_context * ctx, - struct llama_context & lctx, - const llama_hparams & hparams, - const llama_ubatch & ubatch, - struct ggml_tensor * tok_embd, - const llm_build_cb & cb) { - const int64_t n_embd = hparams.n_embd; - - struct ggml_tensor * inpL; - - if (ubatch.token) { - lctx.inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ubatch.n_tokens); - cb(lctx.inp_tokens, "inp_tokens", -1); - ggml_set_input(lctx.inp_tokens); - - inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens); - - // apply lora for embedding tokens if needed - for (auto & it : lctx.lora) { - struct llama_adapter_lora_weight * lw = it.first->get_weight(tok_embd); - if (lw == nullptr) { - continue; - } - const float adapter_scale = it.second; - const float scale = lw->get_scale(it.first->alpha, adapter_scale); - struct ggml_tensor * inpL_delta = ggml_scale(ctx, ggml_mul_mat( - ctx, lw->b, // non-transposed lora_b - ggml_get_rows(ctx, lw->a, lctx.inp_tokens) - ), scale); - inpL = ggml_add(ctx, inpL, inpL_delta); - } - } else { - lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens); - inpL = lctx.inp_embd; - ggml_set_input(lctx.inp_embd); - } - - // For Granite architecture - if (hparams.f_embedding_scale != 0.0f) { - inpL = ggml_scale(ctx, inpL, hparams.f_embedding_scale); - } - - cb(inpL, "inp_embd", -1); - - return inpL; -} - -static void llm_build_kv_store( - struct ggml_context * ctx, - const llama_hparams & hparams, - const llama_cparams & cparams, - const llama_kv_cache & kv, - struct ggml_cgraph * graph, - struct ggml_tensor * k_cur, - struct ggml_tensor * v_cur, - int32_t n_tokens, - int32_t kv_head, - const llm_build_cb & cb, - int64_t il) { - const int64_t n_ctx = cparams.n_ctx; - - const int64_t 
n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - - GGML_ASSERT(kv.size == n_ctx); - - struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa)*kv_head); - cb(k_cache_view, "k_cache_view", il); - - // note: storing RoPE-ed version of K in the KV cache - ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view)); - - assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens); - - struct ggml_tensor * v_cache_view = nullptr; - - if (cparams.flash_attn) { - v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa)*kv_head); - } else { - // note: the V cache is transposed when not using flash attention - v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa, - ( n_ctx)*ggml_element_size(kv.v_l[il]), - (kv_head)*ggml_element_size(kv.v_l[il])); - - v_cur = ggml_transpose(ctx, v_cur); - } - cb(v_cache_view, "v_cache_view", il); - - ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view)); -} - -// do mat_mul, while optionally apply lora -static struct ggml_tensor * llm_build_lora_mm( - struct llama_context & lctx, - struct ggml_context * ctx0, - struct ggml_tensor * w, - struct ggml_tensor * cur) { - struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); - for (auto & it : lctx.lora) { - struct llama_adapter_lora_weight * lw = it.first->get_weight(w); - if (lw == nullptr) { - continue; - } - const float adapter_scale = it.second; - const float scale = lw->get_scale(it.first->alpha, adapter_scale); - struct ggml_tensor * ab_cur = ggml_mul_mat( - ctx0, lw->b, - ggml_mul_mat(ctx0, lw->a, cur) - ); - ab_cur = ggml_scale(ctx0, ab_cur, scale); - res = ggml_add(ctx0, res, ab_cur); - } - return res; -} - -// do mat_mul_id, while optionally apply lora -static struct ggml_tensor * llm_build_lora_mm_id( - struct llama_context & lctx, - struct ggml_context * ctx0, - struct ggml_tensor * w, // struct ggml_tensor * as - struct ggml_tensor * cur, // struct ggml_tensor * b - struct ggml_tensor * ids) { - struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); - for (auto & it : lctx.lora) { - struct llama_adapter_lora_weight * lw = it.first->get_weight(w); - if (lw == nullptr) { - continue; - } - const float alpha = it.first->alpha; - const float rank = (float) lw->b->ne[0]; - const float scale = alpha ? 
it.second * alpha / rank : it.second; - struct ggml_tensor * ab_cur = ggml_mul_mat_id( - ctx0, lw->b, - ggml_mul_mat_id(ctx0, lw->a, cur, ids), - ids - ); - ab_cur = ggml_scale(ctx0, ab_cur, scale); - res = ggml_add(ctx0, res, ab_cur); - } - return res; -} - -static struct ggml_tensor * llm_build_norm( - struct ggml_context * ctx, - struct ggml_tensor * cur, - const llama_hparams & hparams, - struct ggml_tensor * mw, - struct ggml_tensor * mb, - llm_norm_type type, - const llm_build_cb & cb, - int il) { - switch (type) { - case LLM_NORM: cur = ggml_norm (ctx, cur, hparams.f_norm_eps); break; - case LLM_NORM_RMS: cur = ggml_rms_norm (ctx, cur, hparams.f_norm_rms_eps); break; - case LLM_NORM_GROUP: - { - cur = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]); - cur = ggml_group_norm(ctx, cur, hparams.n_norm_groups, hparams.f_norm_group_eps); - cur = ggml_reshape_2d(ctx, cur, cur->ne[0], cur->ne[2]); - } break; - } - - if (mw || mb) { - cb(cur, "norm", il); - } - - if (mw) { - cur = ggml_mul(ctx, cur, mw); - if (mb) { - cb(cur, "norm_w", il); - } - } - - if (mb) { - cur = ggml_add(ctx, cur, mb); - } - - return cur; -} - -static struct ggml_tensor * llm_build_ffn( - struct ggml_context * ctx, - struct llama_context & lctx, - struct ggml_tensor * cur, - struct ggml_tensor * up, - struct ggml_tensor * up_b, - struct ggml_tensor * up_s, - struct ggml_tensor * gate, - struct ggml_tensor * gate_b, - struct ggml_tensor * gate_s, - struct ggml_tensor * down, - struct ggml_tensor * down_b, - struct ggml_tensor * down_s, - struct ggml_tensor * act_scales, - llm_ffn_op_type type_op, - llm_ffn_gate_type type_gate, - const llm_build_cb & cb, - int il) { - struct ggml_tensor * tmp = up ? llm_build_lora_mm(lctx, ctx, up, cur) : cur; - cb(tmp, "ffn_up", il); - - if (up_b) { - tmp = ggml_add(ctx, tmp, up_b); - cb(tmp, "ffn_up_b", il); - } - - if (up_s) { - tmp = ggml_mul(ctx, tmp, up_s); - cb(tmp, "ffn_up_s", il); - } - - if (gate) { - switch (type_gate) { - case LLM_FFN_SEQ: - { - cur = llm_build_lora_mm(lctx, ctx, gate, tmp); - cb(cur, "ffn_gate", il); - } break; - case LLM_FFN_PAR: - { - cur = llm_build_lora_mm(lctx, ctx, gate, cur); - cb(cur, "ffn_gate", il); - } break; - } - - if (gate_b) { - cur = ggml_add(ctx, cur, gate_b); - cb(cur, "ffn_gate_b", il); - } - - if (gate_s) { - cur = ggml_mul(ctx, cur, gate_s); - cb(cur, "ffn_gate_s", il); - } - - } else { - cur = tmp; - } - - switch (type_op) { - case LLM_FFN_SILU: - { - cur = ggml_silu(ctx, cur); - cb(cur, "ffn_silu", il); - } break; - case LLM_FFN_GELU: - { - cur = ggml_gelu(ctx, cur); - cb(cur, "ffn_gelu", il); - if (act_scales != NULL) { - cur = ggml_div(ctx, cur, act_scales); - cb(cur, "ffn_act", il); - } - } break; - case LLM_FFN_RELU: - { - cur = ggml_relu(ctx, cur); - cb(cur, "ffn_relu", il); - } break; - case LLM_FFN_RELU_SQR: - { - cur = ggml_relu(ctx, cur); - cb(cur, "ffn_relu", il); - - cur = ggml_sqr(ctx, cur); - cb(cur, "ffn_sqr(relu)", il); - } break; - case LLM_FFN_SWIGLU: - { - // Project to 4h. 
If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf - int64_t split_point = cur->ne[0] / 2; - struct ggml_tensor * x0 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], 0)); - struct ggml_tensor * x1 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); - - x0 = ggml_silu(ctx, x0); - cb(cur, "ffn_silu", il); - - cur = ggml_mul(ctx, x0, x1); - cb(cur, "ffn_mul", il); - } break; - } - - if (type_gate == LLM_FFN_PAR) { - cur = ggml_mul(ctx, cur, tmp); - cb(cur, "ffn_gate_par", il); - } - - if (down) { - cur = llm_build_lora_mm(lctx, ctx, down, cur); - } - - if (down_b) { - cb(cur, "ffn_down", il); - } - - if (down_b) { - cur = ggml_add(ctx, cur, down_b); - } - - if (down_s) { - cur = ggml_mul(ctx, cur, down_s); - cb(cur, "ffn_down_s", il); - } - - return cur; -} - -static struct ggml_tensor * llm_build_moe_ffn( - struct ggml_context * ctx, - struct llama_context & lctx, - struct ggml_tensor * cur, - struct ggml_tensor * gate_inp, - struct ggml_tensor * up_exps, - struct ggml_tensor * gate_exps, - struct ggml_tensor * down_exps, - struct ggml_tensor * exp_probs_b, - int64_t n_expert, - int64_t n_expert_used, - llm_ffn_op_type type_op, - bool norm_w, - bool scale_w, - float w_scale, -llama_expert_gating_func_type gating_op, - const llm_build_cb & cb, - int il) { - int64_t n_embd = cur->ne[0]; - int64_t n_tokens = cur->ne[1]; - - ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens] - cb(logits, "ffn_moe_logits", il); - - ggml_tensor * probs = nullptr; - switch (gating_op) { - case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: - { - probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens] - } break; - case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: - { - probs = ggml_sigmoid(ctx, logits); // [n_expert, n_tokens] - } break; - default: - GGML_ABORT("fatal error"); - } - cb(probs, "ffn_moe_probs", il); - - // add experts selection bias - introduced in DeepSeek V3 - // leave probs unbiased as it's later used to get expert weights - ggml_tensor * selection_probs = probs; - if (exp_probs_b != nullptr) { - selection_probs = ggml_add(ctx, probs, exp_probs_b); - cb(selection_probs, "ffn_moe_probs_biased", il); - } - - // select experts - ggml_tensor * selected_experts = ggml_top_k(ctx, selection_probs, n_expert_used); // [n_expert_used, n_tokens] - cb(selected_experts->src[0], "ffn_moe_argsort", il); - cb(selected_experts, "ffn_moe_topk", il); - - ggml_tensor * weights = ggml_get_rows(ctx, - ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens] - cb(weights, "ffn_moe_weights", il); - - if (norm_w) { - weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens); - - ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens] - cb(weights_sum, "ffn_moe_weights_sum", il); - - weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens] - cb(weights, "ffn_moe_weights_norm", il); - - weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens); - } - if (scale_w) { - weights = ggml_scale(ctx, weights, w_scale); - cb(weights, "ffn_moe_weights_scaled", il); - } - - cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens); - ggml_tensor * up = llm_build_lora_mm_id(lctx, ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] - cb(up, "ffn_moe_up", il); - - ggml_tensor * gate = llm_build_lora_mm_id(lctx, ctx, gate_exps, cur, selected_experts); // [n_ff, 
n_expert_used, n_tokens] - cb(gate, "ffn_moe_gate", il); - - switch (type_op) { - case LLM_FFN_SILU: - { - gate = ggml_silu(ctx, gate); - cb(gate, "ffn_moe_silu", il); - } break; - case LLM_FFN_GELU: - { - gate = ggml_gelu(ctx, gate); - cb(gate, "ffn_moe_gelu", il); - } break; - default: - GGML_ABORT("fatal error"); - } - - ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens] - cb(par, "ffn_moe_gate_par", il); - - ggml_tensor * experts = llm_build_lora_mm_id(lctx, ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] - cb(experts, "ffn_moe_down", il); - - experts = ggml_mul(ctx, experts, weights); - - // aggregate experts - ggml_tensor * moe_out = nullptr; - for (int i = 0; i < n_expert_used; ++i) { - ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens, - experts->nb[2], i*experts->nb[1]); - - if (i == 0) { - moe_out = cur_expert; - } else { - moe_out = ggml_add(ctx, moe_out, cur_expert); - } - } - - if (n_expert_used == 1) { - // avoid returning a non-contiguous tensor - moe_out = ggml_cont(ctx, moe_out); - } - - return moe_out; -} - -static struct ggml_tensor * llm_build_kqv( - struct ggml_context * ctx, - struct llama_context & lctx, - const llama_kv_cache & kv, - struct ggml_cgraph * graph, - struct ggml_tensor * wo, - struct ggml_tensor * wo_b, - struct ggml_tensor * q_cur, - struct ggml_tensor * kq_mask, - int32_t n_tokens, - int32_t n_kv, - float kq_scale, - const llm_build_cb & cb, - int il) { - const llama_model & model = lctx.model; - const llama_hparams & hparams = lctx.model.hparams; - const llama_cparams & cparams = lctx.cparams; - - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head(il); - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_embd_head_k = hparams.n_embd_head_k; - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_head_v = hparams.n_embd_head_v; - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - - struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3); - cb(q, "q", il); - - struct ggml_tensor * k = - ggml_view_3d(ctx, kv.k_l[il], - n_embd_head_k, n_kv, n_head_kv, - ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv.k_l[il]->type, n_embd_head_k), - 0); - cb(k, "k", il); - - struct ggml_tensor * cur; - - if (cparams.flash_attn) { - GGML_UNUSED(model); - GGML_UNUSED(n_ctx); - - // split cached v into n_head heads (not transposed) - struct ggml_tensor * v = - ggml_view_3d(ctx, kv.v_l[il], - n_embd_head_v, n_kv, n_head_kv, - ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv.v_l[il]->type, n_embd_head_v), - 0); - cb(v, "v", il); - - cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias, - hparams.attn_soft_cap ? 
hparams.f_attn_logit_softcapping : 0.0f); - - ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); - - cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); - } else { - struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); - cb(kq, "kq", il); - - // note: this op tends to require high floating point range - // while for some models F16 is enough, for others it is not, so we default to F32 here - ggml_mul_mat_set_prec(kq, GGML_PREC_F32); - - if (model.arch == LLM_ARCH_GROK) { - // need to do the following: - // multiply by attn_output_multiplyer of 0.08838834764831845 - // and then : - // kq = 30 * tanh(kq / 30) - // before the softmax below - - kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f)); - kq = ggml_scale(ctx, kq, 30); - } - - if (hparams.attn_soft_cap) { - kq = ggml_scale(ctx, kq, 1.0f / hparams.f_attn_logit_softcapping); - kq = ggml_tanh(ctx, kq); - kq = ggml_scale(ctx, kq, hparams.f_attn_logit_softcapping); - } - - kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); - - GGML_ASSERT(kv.size == n_ctx); - - // split cached v into n_head heads - struct ggml_tensor * v = - ggml_view_3d(ctx, kv.v_l[il], - n_kv, n_embd_head_v, n_head_kv, - ggml_element_size(kv.v_l[il])*n_ctx, - ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v, - 0); - cb(v, "v", il); - - struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); - cb(kqv, "kqv", il); - - struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); - - cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens); - cb(cur, "kqv_merged_cont", il); - } - - ggml_build_forward_expand(graph, cur); - - if (wo) { - cur = llm_build_lora_mm(lctx, ctx, wo, cur); - } - - if (wo_b) { - cb(cur, "kqv_wo", il); - } - - if (wo_b) { - cur = ggml_add(ctx, cur, wo_b); - } - - return cur; -} - -static struct ggml_tensor * llm_build_kv( - struct ggml_context * ctx, - struct llama_context & lctx, - const llama_kv_cache & kv, - struct ggml_cgraph * graph, - struct ggml_tensor * wo, - struct ggml_tensor * wo_b, - struct ggml_tensor * k_cur, - struct ggml_tensor * v_cur, - struct ggml_tensor * q_cur, - struct ggml_tensor * kq_mask, - int32_t n_tokens, - int32_t kv_head, - int32_t n_kv, - float kq_scale, - const llm_build_cb & cb, - int il) { - const llama_hparams & hparams = lctx.model.hparams; - const llama_cparams & cparams = lctx.cparams; - - // these nodes are added to the graph together so that they are not reordered - // by doing so, the number of splits in the graph is reduced - ggml_build_forward_expand(graph, q_cur); - ggml_build_forward_expand(graph, k_cur); - ggml_build_forward_expand(graph, v_cur); - - llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il); - - struct ggml_tensor * cur; - - cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b, q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il); - cb(cur, "kqv_out", il); - - return cur; -} - -static struct ggml_tensor * llm_build_copy_mask_state( - struct ggml_context * ctx, - struct ggml_cgraph * graph, - struct ggml_tensor * s, - struct ggml_tensor * state_copy, - struct ggml_tensor * state_mask, - int32_t n_state, - int32_t kv_size, - int32_t kv_head, - int32_t n_kv, - int32_t n_seqs) { - struct ggml_tensor * states = ggml_reshape_2d(ctx, s, n_state, kv_size); - - // copy states - // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv - // this shrinks the tensors's ne[1] to n_kv - states = 
ggml_get_rows(ctx, states, state_copy); - - // clear states of sequences which are starting at the beginning of this batch - // FIXME: zero-out NANs? - states = ggml_mul(ctx, states, state_mask); - - // copy states which won't be changed further (between n_seqs and n_kv) - ggml_build_forward_expand(graph, - ggml_cpy(ctx, - ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*ggml_element_size(states)), - ggml_view_1d(ctx, s, n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*ggml_element_size(s)))); - - // the part of the states that will be used and modified - return ggml_view_2d(ctx, states, n_state, n_seqs, states->nb[1], 0); -} - -// TODO: split -static struct ggml_tensor * llm_build_mamba( - struct ggml_context * ctx, - struct llama_context & lctx, - const llama_ubatch & ubatch, - struct ggml_cgraph * graph, - struct ggml_tensor * cur, - struct ggml_tensor * state_copy, - struct ggml_tensor * state_mask, - int32_t kv_head, - int32_t n_kv, - const llm_build_cb & cb, - int il) { - const llama_model & model = lctx.model; - const llama_hparams & hparams = model.hparams; - const llama_kv_cache & kv = lctx.kv_self; - const int64_t d_conv = hparams.ssm_d_conv; - const int64_t d_inner = hparams.ssm_d_inner; - const int64_t d_state = hparams.ssm_d_state; - const int64_t dt_rank = hparams.ssm_dt_rank; - const int64_t n_seqs = ubatch.n_seqs; - // Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers) - const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms; - // Use the same RMS norm as the final layer norm - const float norm_rms_eps = hparams.f_norm_rms_eps; - - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - - GGML_ASSERT(n_seqs != 0); - GGML_ASSERT(ubatch.equal_seqs); - GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); - - struct ggml_tensor * conv_states_all = kv.k_l[il]; - struct ggml_tensor * ssm_states_all = kv.v_l[il]; - - // (ab)using the KV cache to store the states - struct ggml_tensor * conv = llm_build_copy_mask_state(ctx, - graph, conv_states_all, state_copy, state_mask, - hparams.n_embd_k_s(), kv.size, kv_head, n_kv, n_seqs); - conv = ggml_reshape_3d(ctx, conv, d_conv - 1, d_inner, n_seqs); - struct ggml_tensor * ssm = llm_build_copy_mask_state(ctx, - graph, ssm_states_all, state_copy, state_mask, - hparams.n_embd_v_s(), kv.size, kv_head, n_kv, n_seqs); - ssm = ggml_reshape_3d(ctx, ssm, d_state, d_inner, n_seqs); - - // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} - cur = ggml_reshape_3d(ctx, cur, cur->ne[0], n_seq_tokens, n_seqs); - - // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs} - struct ggml_tensor * xz = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_in, cur); - // split the above in two - // => {d_inner, n_seq_tokens, n_seqs} - struct ggml_tensor * x = ggml_view_3d(ctx, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0); - struct ggml_tensor * z = ggml_view_3d(ctx, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*ggml_element_size(xz)); - - // conv - { - // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs} - struct ggml_tensor * conv_x = ggml_concat(ctx, conv, ggml_transpose(ctx, x), 0); - - // copy last (d_conv - 1) columns back into the state cache - struct ggml_tensor * last_conv = ggml_view_3d(ctx, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); - - ggml_build_forward_expand(graph, - ggml_cpy(ctx, last_conv, - ggml_view_1d(ctx, conv_states_all, - (d_conv - 1)*(d_inner)*(n_seqs), - 
kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all)))); - - // 1D convolution - // The equivalent is to make a self-overlapping view of conv_x - // over d_conv columns at each stride in the 3rd dimension, - // then element-wise multiply that with the conv1d weight, - // then sum the elements of each row, - // (the last two steps are a dot product over rows (also doable with mul_mat)) - // then permute away the ne[0] dimension, - // and then you're left with the resulting x tensor. - // For simultaneous sequences, all sequences need to have the same length. - x = ggml_ssm_conv(ctx, conv_x, model.layers[il].ssm_conv1d); - - // bias - x = ggml_add(ctx, x, model.layers[il].ssm_conv1d_b); - - x = ggml_silu(ctx, x); - } - - // ssm - { - // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs} - struct ggml_tensor * x_db = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_x, x); - // split - struct ggml_tensor * dt = ggml_view_3d(ctx, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0); - struct ggml_tensor * B = ggml_view_3d(ctx, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank); - struct ggml_tensor * C = ggml_view_3d(ctx, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state)); - - // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers - if (ssm_dt_b_c_rms) { - dt = ggml_rms_norm(ctx, dt, norm_rms_eps); - B = ggml_rms_norm(ctx, B, norm_rms_eps); - C = ggml_rms_norm(ctx, C, norm_rms_eps); - } - - // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs} - dt = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_dt, dt); - dt = ggml_add(ctx, dt, model.layers[il].ssm_dt_b); - - // Custom operator to optimize the parallel associative scan - // as described in the Annex D of the Mamba paper. 
- // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs} - struct ggml_tensor * y_ssm = ggml_ssm_scan(ctx, ssm, x, dt, model.layers[il].ssm_a, B, C); - - // store last states - ggml_build_forward_expand(graph, - ggml_cpy(ctx, - ggml_view_1d(ctx, y_ssm, d_state*d_inner*n_seqs, x->nb[3]), - ggml_view_1d(ctx, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all)))); - - struct ggml_tensor * y = ggml_view_3d(ctx, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0); - - // TODO: skip computing output earlier for unused tokens - - // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs} - y = ggml_add(ctx, y, ggml_mul(ctx, x, model.layers[il].ssm_d)); - y = ggml_mul(ctx, y, ggml_silu(ctx, ggml_cont(ctx, z))); - - // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs} - cur = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_out, y); - } - - // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} - cur = ggml_reshape_2d(ctx, cur, cur->ne[0], n_seq_tokens * n_seqs); - cb(cur, "mamba_out", il); - - return cur; -} - -static struct ggml_tensor * llm_build_rwkv6_time_mix( - struct llama_context & lctx, - struct ggml_context * ctx, - const struct llama_layer * layer, - struct ggml_tensor * cur, - struct ggml_tensor * x_prev, - struct ggml_tensor ** wkv_state, - size_t wkv_head_size, - size_t head_count_kv) { - size_t n_embd = cur->ne[0]; - size_t n_seq_tokens = cur->ne[1]; - size_t n_seqs = cur->ne[2]; - - size_t head_size = wkv_head_size; - size_t head_count = n_embd / head_size; - - size_t n_tokens = n_seqs * n_seq_tokens; - - bool is_qrwkv = layer->time_mix_first == nullptr; - - struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur); - - sx = ggml_reshape_2d(ctx, sx, n_embd, n_tokens); - cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); - - struct ggml_tensor * xxx = ggml_add(ctx, ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur); - - xxx = ggml_reshape_4d( - ctx, - ggml_tanh( - ctx, - ggml_mul_mat(ctx, layer->time_mix_w1, xxx) - ), - layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens - ); - - xxx = ggml_cont(ctx, ggml_permute(ctx, xxx, 0, 1, 3, 2)); - - xxx = ggml_mul_mat( - ctx, - ggml_reshape_4d( - ctx, - layer->time_mix_w2, - layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 - ), - xxx - ); - - struct ggml_tensor *xw, *xk, *xv, *xr, *xg; - if (layer->time_mix_lerp_fused) { - // fusing these weights makes some performance improvement - sx = ggml_reshape_3d(ctx, sx, n_embd, 1, n_tokens); - cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens); - xxx = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xxx, layer->time_mix_lerp_fused), sx), cur); - xw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0); - xk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); - xv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); - xr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); - xg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - } else { - // for backward compatibility - xw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0); - xk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); - xv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); - xr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * 
sizeof(float)); - xg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - - xw = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xw, layer->time_mix_lerp_w), sx), cur); - xk = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xk, layer->time_mix_lerp_k), sx), cur); - xv = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xv, layer->time_mix_lerp_v), sx), cur); - xr = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xr, layer->time_mix_lerp_r), sx), cur); - xg = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xg, layer->time_mix_lerp_g), sx), cur); - } - - struct ggml_tensor * r = llm_build_lora_mm(lctx, ctx, layer->time_mix_receptance, xr); - struct ggml_tensor * k = llm_build_lora_mm(lctx, ctx, layer->time_mix_key, xk); - struct ggml_tensor * v = llm_build_lora_mm(lctx, ctx, layer->time_mix_value, xv); - if (layer->time_mix_receptance_b) { - r = ggml_add(ctx, r, layer->time_mix_receptance_b); - } - if (layer->time_mix_key_b) { - k = ggml_add(ctx, k, layer->time_mix_key_b); - } - if (layer->time_mix_value_b) { - v = ggml_add(ctx, v, layer->time_mix_value_b); - } - - struct ggml_tensor * g = llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg); - if (is_qrwkv) { - g = ggml_sigmoid(ctx, g); - } else { - g = ggml_silu(ctx, g); - } - - if (head_count_kv != head_count) { - GGML_ASSERT(head_count % head_count_kv == 0); - k = ggml_reshape_4d(ctx, k, head_size, 1, head_count_kv, n_tokens); - v = ggml_reshape_4d(ctx, v, head_size, 1, head_count_kv, n_tokens); - struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_size, head_count / head_count_kv, head_count_kv, n_tokens); - k = ggml_repeat(ctx, k, tmp); - v = ggml_repeat(ctx, v, tmp); - } - - k = ggml_reshape_3d(ctx, k, head_size, head_count, n_tokens); - v = ggml_reshape_3d(ctx, v, head_size, head_count, n_tokens); - r = ggml_reshape_3d(ctx, r, head_size, head_count, n_tokens); - - struct ggml_tensor * w = ggml_mul_mat( - ctx, - layer->time_mix_decay_w2, - ggml_tanh( - ctx, - ggml_mul_mat(ctx, layer->time_mix_decay_w1, xw) - ) - ); - - w = ggml_add(ctx, w, layer->time_mix_decay); - w = ggml_exp(ctx, ggml_neg(ctx, ggml_exp(ctx, w))); - w = ggml_reshape_3d(ctx, w, head_size, head_count, n_tokens); - - if (is_qrwkv) { - // k = k * (1 - w) - k = ggml_sub(ctx, k, ggml_mul(ctx, k, w)); - } - - struct ggml_tensor * wkv_output; - if (!layer->time_mix_first) { - wkv_output = ggml_gated_linear_attn(ctx, k, v, r, w, *wkv_state, pow(head_size, -0.5f)); - } else { - wkv_output = ggml_rwkv_wkv6(ctx, k, v, r, layer->time_mix_first, w, *wkv_state); - } - cur = ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0); - *wkv_state = ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); - - if (!is_qrwkv) { - // group norm with head_count groups - cur = ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens); - cur = ggml_norm(ctx, cur, 64e-5f); - - // Convert back to regular vectors. 
- cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); - cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b); - } else { - cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); - } - - cur = ggml_mul(ctx, cur, g); - cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur); - - return ggml_reshape_3d(ctx, cur, n_embd, n_seq_tokens, n_seqs); -} - -static struct ggml_tensor * llm_build_rwkv6_channel_mix( - struct llama_context & lctx, - struct ggml_context * ctx, - const struct llama_layer * layer, - struct ggml_tensor * cur, - struct ggml_tensor * x_prev) { - struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur); - struct ggml_tensor * xk = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_k), cur); - struct ggml_tensor * xr = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_r), cur); - - struct ggml_tensor * r = ggml_sigmoid(ctx, llm_build_lora_mm(lctx, ctx, layer->channel_mix_receptance, xr)); - struct ggml_tensor * k = ggml_sqr( - ctx, - ggml_relu( - ctx, - llm_build_lora_mm(lctx, ctx, layer->channel_mix_key, xk) - ) - ); - - return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k)); -} - struct llm_build_context { - const llama_model & model; - llama_context & lctx; - const llama_hparams & hparams; - const llama_cparams & cparams; - const llama_ubatch & ubatch; - const llama_kv_cache & kv_self; + llama_context & lctx; + const llama_model & model; + const llama_hparams & hparams; + const llama_cparams & cparams; + const llama_ubatch & ubatch; + //const llama_kv_cache & kv_self; + const llama_adapter_cvec & cvec; + const llama_loras & loras; const int64_t n_embd; const int64_t n_layer; @@ -1070,12 +137,13 @@ struct llm_build_context { const float norm_rms_eps; const int32_t n_tokens; - const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size) + //const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size) + //const int32_t kv_head; // index of where we store new KV data in the cache const int32_t n_outputs; const int32_t n_outputs_enc; - const int32_t kv_head; // index of where we store new KV data in the cache const int32_t n_ctx_orig; + const bool worst_case; const bool flash_attn; const enum llama_pooling_type pooling_type; @@ -1089,16 +157,18 @@ struct llm_build_context { // TODO: consider making the entire interface noexcept llm_build_context( - llama_context & lctx, - const llama_ubatch & ubatch, - const llm_build_cb & cb, - bool worst_case) : - model (lctx.model), + llama_context & lctx, + const llama_ubatch & ubatch, + const llm_build_cb & cb, + bool worst_case) : lctx (lctx), + model (lctx.model), hparams (model.hparams), cparams (lctx.cparams), ubatch (ubatch), - kv_self (lctx.kv_self), + //kv_self (lctx.kv_self), + cvec (lctx.cvec), + loras (lctx.loras), n_embd (hparams.n_embd), n_layer (hparams.n_layer), n_rot (hparams.n_rot), @@ -1120,11 +190,12 @@ struct llm_build_context { norm_eps (hparams.f_norm_eps), norm_rms_eps (hparams.f_norm_rms_eps), n_tokens (ubatch.n_tokens), - n_kv (worst_case ? kv_self.size : kv_self.n), + //n_kv (worst_case ? kv_self.size : kv_self.n), + //kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head), n_outputs (worst_case ? n_tokens : lctx.n_outputs), n_outputs_enc (worst_case ? n_tokens : lctx.embd_enc.size() / hparams.n_embd), - kv_head (worst_case ? (kv_self.recurrent ? 
0 : kv_self.size - n_tokens) : kv_self.head), n_ctx_orig (cparams.n_ctx_orig_yarn), + worst_case (worst_case), flash_attn (cparams.flash_attn), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), @@ -1142,21 +213,7 @@ struct llm_build_context { ctx0 = ggml_init(params); - lctx.inp_tokens = nullptr; - lctx.inp_embd = nullptr; - lctx.inp_pos = nullptr; - lctx.inp_out_ids = nullptr; - lctx.inp_KQ_mask = nullptr; - lctx.inp_KQ_mask_swa = nullptr; - lctx.inp_K_shift = nullptr; - lctx.inp_mean = nullptr; - lctx.inp_cls = nullptr; - lctx.inp_s_copy = nullptr; - lctx.inp_s_mask = nullptr; - lctx.inp_s_seq = nullptr; - lctx.inp_pos_bucket = nullptr; - lctx.inp_embd_enc = nullptr; - lctx.inp_KQ_mask_cross = nullptr; + lctx.reset(); } void free() { @@ -1164,125 +221,597 @@ struct llm_build_context { ctx0 = nullptr; } + struct ggml_tensor * build_inp_embd(struct ggml_tensor * tok_embd) { + struct ggml_tensor * inpL; + + if (ubatch.token) { + lctx.inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); + cb(lctx.inp_tokens, "inp_tokens", -1); + ggml_set_input(lctx.inp_tokens); + + inpL = ggml_get_rows(ctx0, tok_embd, lctx.inp_tokens); + + // apply lora for embedding tokens if needed + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(tok_embd); + if (lw == nullptr) { + continue; + } + + const float adapter_scale = lora.second; + const float scale = lw->get_scale(lora.first->alpha, adapter_scale); + + struct ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat( + ctx0, lw->b, // non-transposed lora_b + ggml_get_rows(ctx0, lw->a, lctx.inp_tokens) + ), scale); + + inpL = ggml_add(ctx0, inpL, inpL_delta); + } + } else { + lctx.inp_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); + inpL = lctx.inp_embd; + ggml_set_input(lctx.inp_embd); + } + + // For Granite architecture + if (hparams.f_embedding_scale != 0.0f) { + inpL = ggml_scale(ctx0, inpL, hparams.f_embedding_scale); + } + + cb(inpL, "inp_embd", -1); + + return inpL; + } + + // do mat_mul, while optionally apply lora + struct ggml_tensor * build_lora_mm( + struct ggml_tensor * w, + struct ggml_tensor * cur) { + struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); + + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } + + const float adapter_scale = lora.second; + const float scale = lw->get_scale(lora.first->alpha, adapter_scale); + + struct ggml_tensor * ab_cur = ggml_mul_mat( + ctx0, lw->b, + ggml_mul_mat(ctx0, lw->a, cur) + ); + + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + + return res; + } + + // do mat_mul_id, while optionally apply lora + struct ggml_tensor * build_lora_mm_id( + struct ggml_tensor * w, // struct ggml_tensor * as + struct ggml_tensor * cur, // struct ggml_tensor * b + struct ggml_tensor * ids) { + struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(w); + if (lw == nullptr) { + continue; + } + + const float alpha = lora.first->alpha; + const float rank = (float) lw->b->ne[0]; + const float scale = alpha ? 
lora.second * alpha / rank : lora.second; + + struct ggml_tensor * ab_cur = ggml_mul_mat_id( + ctx0, lw->b, + ggml_mul_mat_id(ctx0, lw->a, cur, ids), + ids + ); + + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + + return res; + } + + struct ggml_tensor * build_norm( + struct ggml_tensor * cur, + struct ggml_tensor * mw, + struct ggml_tensor * mb, + llm_norm_type type, + int il) { + switch (type) { + case LLM_NORM: cur = ggml_norm (ctx0, cur, hparams.f_norm_eps); break; + case LLM_NORM_RMS: cur = ggml_rms_norm (ctx0, cur, hparams.f_norm_rms_eps); break; + case LLM_NORM_GROUP: + { + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], 1, cur->ne[1]); + cur = ggml_group_norm(ctx0, cur, hparams.n_norm_groups, hparams.f_norm_group_eps); + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[2]); + } break; + } + + if (mw || mb) { + cb(cur, "norm", il); + } + + if (mw) { + cur = ggml_mul(ctx0, cur, mw); + if (mb) { + cb(cur, "norm_w", il); + } + } + + if (mb) { + cur = ggml_add(ctx0, cur, mb); + } + + return cur; + } + + struct ggml_tensor * build_ffn( + struct ggml_tensor * cur, + struct ggml_tensor * up, + struct ggml_tensor * up_b, + struct ggml_tensor * up_s, + struct ggml_tensor * gate, + struct ggml_tensor * gate_b, + struct ggml_tensor * gate_s, + struct ggml_tensor * down, + struct ggml_tensor * down_b, + struct ggml_tensor * down_s, + struct ggml_tensor * act_scales, + llm_ffn_op_type type_op, + llm_ffn_gate_type type_gate, + const llm_build_cb & cb, + int il) { + struct ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur; + cb(tmp, "ffn_up", il); + + if (up_b) { + tmp = ggml_add(ctx0, tmp, up_b); + cb(tmp, "ffn_up_b", il); + } + + if (up_s) { + tmp = ggml_mul(ctx0, tmp, up_s); + cb(tmp, "ffn_up_s", il); + } + + if (gate) { + switch (type_gate) { + case LLM_FFN_SEQ: + { + cur = build_lora_mm(gate, tmp); + cb(cur, "ffn_gate", il); + } break; + case LLM_FFN_PAR: + { + cur = build_lora_mm(gate, cur); + cb(cur, "ffn_gate", il); + } break; + } + + if (gate_b) { + cur = ggml_add(ctx0, cur, gate_b); + cb(cur, "ffn_gate_b", il); + } + + if (gate_s) { + cur = ggml_mul(ctx0, cur, gate_s); + cb(cur, "ffn_gate_s", il); + } + + } else { + cur = tmp; + } + + switch (type_op) { + case LLM_FFN_SILU: + { + cur = ggml_silu(ctx0, cur); + cb(cur, "ffn_silu", il); + } break; + case LLM_FFN_GELU: + { + cur = ggml_gelu(ctx0, cur); + cb(cur, "ffn_gelu", il); + if (act_scales != NULL) { + cur = ggml_div(ctx0, cur, act_scales); + cb(cur, "ffn_act", il); + } + } break; + case LLM_FFN_RELU: + { + cur = ggml_relu(ctx0, cur); + cb(cur, "ffn_relu", il); + } break; + case LLM_FFN_RELU_SQR: + { + cur = ggml_relu(ctx0, cur); + cb(cur, "ffn_relu", il); + + cur = ggml_sqr(ctx0, cur); + cb(cur, "ffn_sqr(relu)", il); + } break; + case LLM_FFN_SWIGLU: + { + // Project to 4h. 
If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf + int64_t split_point = cur->ne[0] / 2; + struct ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0)); + struct ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur))); + + x0 = ggml_silu(ctx0, x0); + cb(cur, "ffn_silu", il); + + cur = ggml_mul(ctx0, x0, x1); + cb(cur, "ffn_mul", il); + } break; + } + + if (type_gate == LLM_FFN_PAR) { + cur = ggml_mul(ctx0, cur, tmp); + cb(cur, "ffn_gate_par", il); + } + + if (down) { + cur = build_lora_mm(down, cur); + } + + if (down_b) { + cb(cur, "ffn_down", il); + } + + if (down_b) { + cur = ggml_add(ctx0, cur, down_b); + } + + if (down_s) { + cur = ggml_mul(ctx0, cur, down_s); + cb(cur, "ffn_down_s", il); + } + + return cur; + } + + struct ggml_tensor * build_moe_ffn( + struct ggml_tensor * cur, + struct ggml_tensor * gate_inp, + struct ggml_tensor * up_exps, + struct ggml_tensor * gate_exps, + struct ggml_tensor * down_exps, + struct ggml_tensor * exp_probs_b, + int64_t n_expert, + int64_t n_expert_used, + llm_ffn_op_type type_op, + bool norm_w, + bool scale_w, + float w_scale, + llama_expert_gating_func_type gating_op, + const llm_build_cb & cb, + int il) { + int64_t n_embd = cur->ne[0]; + int64_t n_tokens = cur->ne[1]; + + ggml_tensor * logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens] + cb(logits, "ffn_moe_logits", il); + + ggml_tensor * probs = nullptr; + switch (gating_op) { + case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: + { + probs = ggml_soft_max(ctx0, logits); // [n_expert, n_tokens] + } break; + case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: + { + probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens] + } break; + default: + GGML_ABORT("fatal error"); + } + cb(probs, "ffn_moe_probs", il); + + // add experts selection bias - introduced in DeepSeek V3 + // leave probs unbiased as it's later used to get expert weights + ggml_tensor * selection_probs = probs; + if (exp_probs_b != nullptr) { + selection_probs = ggml_add(ctx0, probs, exp_probs_b); + cb(selection_probs, "ffn_moe_probs_biased", il); + } + + // select experts + ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens] + cb(selected_experts->src[0], "ffn_moe_argsort", il); + cb(selected_experts, "ffn_moe_topk", il); + + ggml_tensor * weights = ggml_get_rows(ctx0, + ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens] + cb(weights, "ffn_moe_weights", il); + + if (norm_w) { + weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); + + ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens] + cb(weights_sum, "ffn_moe_weights_sum", il); + + weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens] + cb(weights, "ffn_moe_weights_norm", il); + + weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens); + } + if (scale_w) { + weights = ggml_scale(ctx0, weights, w_scale); + cb(weights, "ffn_moe_weights_scaled", il); + } + + cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); + ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(up, "ffn_moe_up", il); + + ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(gate, "ffn_moe_gate", il); + + switch (type_op) { + case LLM_FFN_SILU: + { + gate = 
ggml_silu(ctx0, gate); + cb(gate, "ffn_moe_silu", il); + } break; + case LLM_FFN_GELU: + { + gate = ggml_gelu(ctx0, gate); + cb(gate, "ffn_moe_gelu", il); + } break; + default: + GGML_ABORT("fatal error"); + } + + ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens] + cb(par, "ffn_moe_gate_par", il); + + ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] + cb(experts, "ffn_moe_down", il); + + experts = ggml_mul(ctx0, experts, weights); + + // aggregate experts + ggml_tensor * moe_out = nullptr; + for (int i = 0; i < n_expert_used; ++i) { + ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens, + experts->nb[2], i*experts->nb[1]); + + if (i == 0) { + moe_out = cur_expert; + } else { + moe_out = ggml_add(ctx0, moe_out, cur_expert); + } + } + + if (n_expert_used == 1) { + // avoid returning a non-contiguous tensor + moe_out = ggml_cont(ctx0, moe_out); + } + + return moe_out; + } + + struct ggml_tensor * build_attn( + struct ggml_cgraph * graph, + struct ggml_tensor * wo, + struct ggml_tensor * wo_b, + struct ggml_tensor * k_cur, + struct ggml_tensor * v_cur, + struct ggml_tensor * q_cur, + int32_t n_tokens, + float kq_scale, + const llm_build_cb & cb, + int il) { + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + ggml_build_forward_expand(graph, q_cur); + ggml_build_forward_expand(graph, k_cur); + ggml_build_forward_expand(graph, v_cur); + + //build_kv_store(graph, k_cur, v_cur, il); + lctx.build_attn_kv_store(ctx0, graph, k_cur, v_cur, n_tokens, il, worst_case); + + struct ggml_tensor * cur; + + //cur = build_kqv(graph, wo, wo_b, q_cur, kq_mask, kq_scale, il); + cur = lctx.build_attn_qkv(ctx0, graph, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case); + cb(cur, "kqv_out", il); + + return cur; + } + + //struct ggml_tensor * build_rwkv6_time_mix( + // const struct llama_layer * layer, + // struct ggml_tensor * cur, + // struct ggml_tensor * x_prev, + // struct ggml_tensor ** wkv_state, + // size_t wkv_head_size, + // size_t head_count_kv) { + // size_t n_embd = cur->ne[0]; + // size_t n_seq_tokens = cur->ne[1]; + // size_t n_seqs = cur->ne[2]; + + // size_t head_size = wkv_head_size; + // size_t head_count = n_embd / head_size; + + // size_t n_tokens = n_seqs * n_seq_tokens; + + // bool is_qrwkv = layer->time_mix_first == nullptr; + + // struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); + + // sx = ggml_reshape_2d(ctx0, sx, n_embd, n_tokens); + // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + + // struct ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->time_mix_lerp_x), cur); + + // xxx = ggml_reshape_4d( + // ctx0, + // ggml_tanh( + // ctx0, + // ggml_mul_mat(ctx0, layer->time_mix_w1, xxx) + // ), + // layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens + // ); + + // xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2)); + + // xxx = ggml_mul_mat( + // ctx0, + // ggml_reshape_4d( + // ctx0, + // layer->time_mix_w2, + // layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 + // ), + // xxx + // ); + + // struct ggml_tensor *xw, *xk, *xv, *xr, *xg; + // if (layer->time_mix_lerp_fused) { + // // fusing these weights makes some performance improvement + // sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens); + // cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); + // xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer->time_mix_lerp_fused), sx), cur); 
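    //          // illustrative sketch (wording ours, not from the code): the fused add above and the
    //          // per-channel path in the else branch below compute the same token-shift lerp for the
    //          // five mixed inputs c in {w, k, v, r, g}:
    //          //
    //          //     x_c = cur + (x_prev - cur) * (time_mix_lerp_c + xxx_c)
    //          //
    //          // where xxx_c is the data-dependent offset produced by time_mix_w1/time_mix_w2 above;
    //          // the fused branch simply evaluates all five lerps with one broadcast add/mul.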
+ // xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); + // xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + // xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + // xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + // xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); + // } else { + // // for backward compatibility + // xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); + // xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + // xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + // xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + // xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); + + // xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer->time_mix_lerp_w), sx), cur); + // xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer->time_mix_lerp_k), sx), cur); + // xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer->time_mix_lerp_v), sx), cur); + // xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer->time_mix_lerp_r), sx), cur); + // xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer->time_mix_lerp_g), sx), cur); + // } + + // struct ggml_tensor * r = build_lora_mm(layer->time_mix_receptance, xr); + // struct ggml_tensor * k = build_lora_mm(layer->time_mix_key, xk); + // struct ggml_tensor * v = build_lora_mm(layer->time_mix_value, xv); + // if (layer->time_mix_receptance_b) { + // r = ggml_add(ctx0, r, layer->time_mix_receptance_b); + // } + // if (layer->time_mix_key_b) { + // k = ggml_add(ctx0, k, layer->time_mix_key_b); + // } + // if (layer->time_mix_value_b) { + // v = ggml_add(ctx0, v, layer->time_mix_value_b); + // } + + // struct ggml_tensor * g = build_lora_mm(layer->time_mix_gate, xg); + // if (is_qrwkv) { + // g = ggml_sigmoid(ctx0, g); + // } else { + // g = ggml_silu(ctx0, g); + // } + + // if (head_count_kv != head_count) { + // GGML_ASSERT(head_count % head_count_kv == 0); + // k = ggml_reshape_4d(ctx0, k, head_size, 1, head_count_kv, n_tokens); + // v = ggml_reshape_4d(ctx0, v, head_size, 1, head_count_kv, n_tokens); + // struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, head_count / head_count_kv, head_count_kv, n_tokens); + // k = ggml_repeat(ctx0, k, tmp); + // v = ggml_repeat(ctx0, v, tmp); + // } + + // k = ggml_reshape_3d(ctx0, k, head_size, head_count, n_tokens); + // v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens); + // r = ggml_reshape_3d(ctx0, r, head_size, head_count, n_tokens); + + // struct ggml_tensor * w = ggml_mul_mat( + // ctx0, + // layer->time_mix_decay_w2, + // ggml_tanh( + // ctx0, + // ggml_mul_mat(ctx0, layer->time_mix_decay_w1, xw) + // ) + // ); + + // w = ggml_add(ctx0, w, layer->time_mix_decay); + // w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w))); + // w = ggml_reshape_3d(ctx0, w, head_size, head_count, n_tokens); + + // if (is_qrwkv) { + // // k = k * (1 - w) + // k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w)); + // } + + // struct ggml_tensor * wkv_output; + // if (!layer->time_mix_first) { + // wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, *wkv_state, pow(head_size, -0.5f)); + // } else { + // wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, 
layer->time_mix_first, w, *wkv_state); + // } + // cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0); + // *wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); + + // if (!is_qrwkv) { + // // group norm with head_count groups + // cur = ggml_reshape_3d(ctx0, cur, n_embd / head_count, head_count, n_tokens); + // cur = ggml_norm(ctx0, cur, 64e-5f); + + // // Convert back to regular vectors. + // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + // cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer->time_mix_ln), layer->time_mix_ln_b); + // } else { + // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + // } + + // cur = ggml_mul(ctx0, cur, g); + // cur = build_lora_mm(layer->time_mix_output, cur); + + // return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs); + //} + + //struct ggml_tensor * build_rwkv6_channel_mix( + // const struct llama_layer * layer, + // struct ggml_tensor * cur, + // struct ggml_tensor * x_prev) { + // struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); + // struct ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur); + // struct ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur); + + // struct ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr)); + // struct ggml_tensor * k = ggml_sqr( + // ctx0, + // ggml_relu( + // ctx0, + // build_lora_mm(layer->channel_mix_key, xk) + // ) + // ); + + // return ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k)); + //} + struct ggml_cgraph * build_k_shift() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - GGML_ASSERT(kv_self.size == n_ctx); - - lctx.inp_K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - cb(lctx.inp_K_shift, "K_shift", -1); - ggml_set_input(lctx.inp_K_shift); - - for (int il = 0; il < n_layer; ++il) { - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - - struct ggml_tensor * rope_factors = build_rope_factors(il); - - struct ggml_tensor * k = - ggml_view_3d(ctx0, kv_self.k_l[il], - n_embd_head_k, n_head_kv, n_ctx, - ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - 0); - - struct ggml_tensor * tmp; - if (ggml_is_quantized(k->type)) { - // dequantize to f32 -> RoPE -> quantize back - tmp = ggml_cast(ctx0, k, GGML_TYPE_F32); - cb(tmp, "K_f32", il); - - for (auto & backend : lctx.backends) { - // Figure out which backend KV cache belongs to - if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) { - ggml_backend_sched_set_tensor_backend(lctx.sched.get(), tmp, backend.get()); - break; - } - } - tmp = ggml_rope_ext_inplace(ctx0, tmp, - lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - cb(tmp, "K_shifted_f32", il); - - tmp = ggml_cpy(ctx0, tmp, k); - } else { - // we rotate only the first n_rot dimensions - tmp = ggml_rope_ext_inplace(ctx0, k, - lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - } - cb(tmp, "K_shifted", il); - - ggml_build_forward_expand(gf, tmp); - } + lctx.build_k_shift(ctx0, gf); return gf; } - struct ggml_cgraph * build_defrag(const std::vector & ids) { + struct ggml_cgraph * build_defrag() { struct ggml_cgraph * gf = 
ggml_new_graph_custom(ctx0, model.max_nodes(), false); - for (uint32_t i = 0; i < ids.size(); ++i) { - const uint32_t id = ids[i]; - - if (i == id || id == ids.size()) { - continue; - } - - uint32_t nm = 1; - - while (i + nm < ids.size() && ids[i + nm] == id + nm) { - nm++; - } - - for (int il = 0; il < n_layer; ++il) { - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - - ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], - n_embd_k_gqa, nm, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i)); - - ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il], - n_embd_k_gqa, nm, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id)); - - ggml_tensor * view_v_src; - ggml_tensor * view_v_dst; - - if (flash_attn) { - // NOTE: the V cache is not transposed when using flash attention - view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], - n_embd_v_gqa, nm, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i)); - - view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], - n_embd_v_gqa, nm, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id)); - } else { - view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], - nm, n_embd_v_gqa, - ggml_row_size(kv_self.v_l[il]->type, kv_self.size), - ggml_row_size(kv_self.v_l[il]->type, i)); - - view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], - nm, n_embd_v_gqa, - ggml_row_size(kv_self.v_l[il]->type, kv_self.size), - ggml_row_size(kv_self.v_l[il]->type, id)); - } - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst)); - } - - i += nm - 1; - } - - //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); + lctx.build_defrag(ctx0, gf); return gf; } @@ -1294,21 +823,6 @@ struct llm_build_context { return lctx.inp_pos; } - struct ggml_tensor * build_rope_factors(int il) { - // choose long/short freq factors based on the context size - const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max; - - if (model.layers[il].rope_freqs != nullptr) { - return model.layers[il].rope_freqs; - } - - if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) { - return model.layers[il].rope_long; - } - - return model.layers[il].rope_short; - } - struct ggml_tensor * build_inp_out_ids() { lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); cb(lctx.inp_out_ids, "inp_out_ids", -1); @@ -1316,28 +830,6 @@ struct llm_build_context { return lctx.inp_out_ids; } - struct ggml_tensor * build_inp_KQ_mask(bool causal = true) { - lctx.inp_KQ_mask = causal - ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) - : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - cb(lctx.inp_KQ_mask, "KQ_mask", -1); - ggml_set_input(lctx.inp_KQ_mask); - - return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask; - } - - struct ggml_tensor * build_inp_KQ_mask_swa(bool causal = true) { - GGML_ASSERT(hparams.n_swa > 0); - - lctx.inp_KQ_mask_swa = causal - ? 
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) - : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - cb(lctx.inp_KQ_mask_swa, "KQ_mask_swa", -1); - ggml_set_input(lctx.inp_KQ_mask_swa); - - return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask_swa, GGML_TYPE_F16) : lctx.inp_KQ_mask_swa; - } - struct ggml_tensor * build_inp_mean() { lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); cb(lctx.inp_mean, "inp_mean", -1); @@ -1352,20 +844,6 @@ struct llm_build_context { return lctx.inp_cls; } - struct ggml_tensor * build_inp_s_copy() { - lctx.inp_s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); - cb(lctx.inp_s_copy, "inp_s_copy", -1); - ggml_set_input(lctx.inp_s_copy); - return lctx.inp_s_copy; - } - - struct ggml_tensor * build_inp_s_mask() { - lctx.inp_s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); - cb(lctx.inp_s_mask, "inp_s_mask", -1); - ggml_set_input(lctx.inp_s_mask); - return lctx.inp_s_mask; - } - struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) { // find result_norm tensor for input struct ggml_tensor * inp = nullptr; @@ -1431,37 +909,37 @@ struct llm_build_context { return gf; } - struct ggml_tensor * build_pos_bucket(bool causal) { - if (causal) { - lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); - } else { - lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); - } + //struct ggml_tensor * build_pos_bucket(bool causal) { + // if (causal) { + // lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); + // } else { + // lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); + // } - ggml_set_input(lctx.inp_pos_bucket); - cb(lctx.inp_pos_bucket, "pos_bucket", -1); + // ggml_set_input(lctx.inp_pos_bucket); + // cb(lctx.inp_pos_bucket, "pos_bucket", -1); - return lctx.inp_pos_bucket; - } + // return lctx.inp_pos_bucket; + //} - struct ggml_tensor * build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { - struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0); - cb(pos_bucket_1d, "pos_bucket_1d", -1); + //struct ggml_tensor * build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) { + // struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0); + // cb(pos_bucket_1d, "pos_bucket_1d", -1); - struct ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d); - cb(pos_bias, "pos_bias", -1); + // struct ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d); + // cb(pos_bias, "pos_bias", -1); - pos_bias = ggml_view_3d(ctx0, pos_bias, pos_bias->ne[0], lctx.inp_pos_bucket->ne[0], lctx.inp_pos_bucket->ne[1], ggml_element_size(pos_bias) * pos_bias->ne[0], ggml_element_size(pos_bias) * pos_bias->ne[0] * lctx.inp_pos_bucket->ne[0], 0); - cb(pos_bias, "pos_bias", -1); + // pos_bias = ggml_view_3d(ctx0, pos_bias, pos_bias->ne[0], lctx.inp_pos_bucket->ne[0], lctx.inp_pos_bucket->ne[1], ggml_element_size(pos_bias) * pos_bias->ne[0], ggml_element_size(pos_bias) * pos_bias->ne[0] * lctx.inp_pos_bucket->ne[0], 0); + // cb(pos_bias, "pos_bias", -1); - pos_bias = ggml_permute(ctx0, pos_bias, 2, 0, 1, 3); - cb(pos_bias, "pos_bias", -1); + // pos_bias = ggml_permute(ctx0, pos_bias, 2, 0, 1, 3); + // cb(pos_bias, "pos_bias", -1); - pos_bias = ggml_cont(ctx0, pos_bias); - 
cb(pos_bias, "pos_bias", -1); + // pos_bias = ggml_cont(ctx0, pos_bias); + // cb(pos_bias, "pos_bias", -1); - return pos_bias; - } + // return pos_bias; + //} struct ggml_tensor * build_inp_embd_enc() { const int64_t n_embd = hparams.n_embd; @@ -1491,45 +969,44 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -1550,9 +1027,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + Kcur, Vcur, Qcur, n_tokens, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -1574,12 +1051,12 @@ struct llm_build_context { // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -1588,12 +1065,12 @@ struct llm_build_context { cb(cur, "ffn_out", il); } else { // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -1615,7 +1092,7 @@ struct 
llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1624,13 +1101,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); // For Granite architecture if (hparams.f_logit_scale) { @@ -1657,13 +1134,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -1676,37 +1152,37 @@ struct llm_build_context { cur = inpL; } else { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); } if (n_head > 0 && n_head_kv == 0) { // "linear attention" of Llama-3_1-Nemotron-51B - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); + cur = build_lora_mm(model.layers[il].wo, cur); cb(cur, "wo", il); } else if (n_head > 0) { // self-attention // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -1727,9 +1203,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + Kcur, Vcur, Qcur, n_tokens, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -1754,12 +1230,12 @@ struct llm_build_context { // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, 
model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -1776,7 +1252,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1785,13 +1261,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); // For Granite architecture if (hparams.f_logit_scale) { @@ -1815,31 +1291,30 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr; - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); switch (model.type) { @@ -1865,9 +1340,9 @@ struct llm_build_context { cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -1882,12 +1357,12 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -1897,7 +1372,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1906,13 +1381,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); 
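        // illustrative summary (not part of the original code): this epilogue is shared by the
        // builders in this refactor - final output norm, then the lm_head projection through
        // build_lora_mm(model.output, cur), which folds any active LoRA adapters into the head:
        //
        //     res = W*cur + sum_i scale_i * B_i*(A_i*cur)
        //
        // with scale_i derived from the adapter scale and alpha/rank (see build_lora_mm and
        // build_lora_mm_id above), before the result is expanded into the graph.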
ggml_build_forward_expand(gf, cur); @@ -1930,31 +1405,30 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -1970,9 +1444,9 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -1987,12 +1461,12 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2002,7 +1476,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2011,11 +1485,11 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1); + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -2034,37 +1508,36 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; - attn_norm = llm_build_norm(ctx0, inpL, hparams, + attn_norm = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - 
LLM_NORM, cb, il); + LLM_NORM, il); cb(attn_norm, "attn_norm", il); // self-attention { if (model.layers[il].attn_norm_2) { // Falcon-40B - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm_2", il); } else { cur = attn_norm; } - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); @@ -2091,9 +1564,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -2108,7 +1581,7 @@ struct llm_build_context { // feed forward { - cur = llm_build_ffn(ctx0, lctx, attn_norm, // !! use the attn norm, not the result + cur = build_ffn(attn_norm, // !! use the attn norm, not the result model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2119,7 +1592,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cur = ggml_add(ctx0, cur, inpL); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2129,13 +1602,13 @@ struct llm_build_context { cur = inpL; // norm - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -2156,7 +1629,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // multiply by embedding_multiplier_scale of 78.38367176906169 inpL = ggml_scale(ctx0, inpL, 78.38367176906169f); @@ -2164,37 +1637,36 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, 
model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -2215,9 +1687,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); } if (il == n_layer - 1) { @@ -2231,9 +1703,9 @@ struct llm_build_context { // Grok // if attn_out_norm is present then apply it before adding the input if (model.layers[il].attn_out_norm) { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].attn_out_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_out_norm", il); } @@ -2242,12 +1714,12 @@ struct llm_build_context { // feed-forward network // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -2264,16 +1736,16 @@ struct llm_build_context { // if layer_out_norm is present then apply it before adding the input // Idea: maybe ffn_out_norm is a better name if (model.layers[il].layer_out_norm) { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].layer_out_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "layer_out_norm", il); } cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2282,13 +1754,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); // Grok // multiply logits by output_multiplier_scale of 0.5773502691896257 @@ -2316,21 +1788,20 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm, NULL, - LLM_NORM, cb, il); + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention @@ -2339,7 +1810,7 @@ struct llm_build_context { struct ggml_tensor * Kcur = nullptr; struct ggml_tensor * Vcur = nullptr; - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); @@ -2367,9 +1838,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = 
build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -2385,12 +1856,12 @@ struct llm_build_context { // feed-forward network // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].attn_out_norm, NULL, - LLM_NORM, cb, il); + cur = build_norm(ffn_inp, + model.layers[il].attn_out_norm, NULL, + LLM_NORM, il); cb(cur, "attn_out_norm", il); - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -2406,7 +1877,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2415,13 +1886,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm, NULL, - LLM_NORM, cb, -1); + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); @@ -2440,13 +1911,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -2455,15 +1925,15 @@ struct llm_build_context { cb(inpL, "inpL", -1); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -2479,9 +1949,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -2497,13 +1967,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -2513,20 +1983,20 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = 
llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -2543,28 +2013,27 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); @@ -2573,9 +2042,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cb(Qcur, "Qcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -2590,12 +2059,12 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2605,7 +2074,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2614,13 +2083,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -2645,7 +2114,7 @@ struct llm_build_context { } // construct input embeddings (token, type, position) - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // token types are hardcoded to zero ("Sentence A") struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0); @@ -2656,11 +2125,10 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // embed layer norm 
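        // illustrative note: LLM_NORM here is the standard LayerNorm used by the BERT-style
        // graphs, i.e. via build_norm:
        //
        //     y = (x - mean(x)) / sqrt(var(x) + f_norm_eps) * tok_norm + tok_norm_b
        //
        // in contrast to the RMS variant (LLM_NORM_RMS) used by the llama-style builders.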
- inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); + inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); cb(inpL, "inp_norm", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(false); + lctx.build_attn_inp(ctx0, n_tokens, false, false, worst_case); // iterate layers for (int il = 0; il < n_layer; ++il) { @@ -2672,33 +2140,33 @@ struct llm_build_context { // self-attention if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) { - Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur), model.layers[il].bq); + Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq); cb(Qcur, "Qcur", il); if (model.layers[il].attn_q_norm) { - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); } - Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur), model.layers[il].bk); + Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk); cb(Kcur, "Kcur", il); if (model.layers[il].attn_k_norm) { - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); } - Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur), model.layers[il].bv); + Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv); cb(Vcur, "Vcur", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); } else { // compute Q and K and RoPE them - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); @@ -2730,7 +2198,8 @@ struct llm_build_context { struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); cb(kq, "kq", il); - kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); + //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); + kq = lctx.build_soft_max_ext(ctx0, kq, 1.0f/sqrtf(float(n_embd_head))); cb(kq, "kq_soft_max_ext", il); struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); @@ -2747,7 +2216,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, cur); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); + cur = build_lora_mm(model.layers[il].wo, cur); if (model.layers[il].bo) { cb(cur, "kqv_wo", il); } @@ -2768,11 +2237,11 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, inpL); // attention layer norm - cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il); + cur = build_norm(cur, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, il); if (model.layers[il].attn_norm_2 != nullptr) { cur = ggml_add(ctx0, cur, inpL); // re-add the layer input - cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il); + cur = build_norm(cur, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, il); } struct 
ggml_tensor * ffn_inp = cur; @@ -2780,21 +2249,21 @@ struct llm_build_context { // feed-forward network if (model.arch == LLM_ARCH_BERT) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_PAR, cb, il); } else { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -2807,7 +2276,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); // output layer norm - cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il); + cur = build_norm(cur, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, il); // input for next layer inpL = cur; @@ -2832,27 +2301,26 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); - inpL = llm_build_norm(ctx0, inpL, hparams, + inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(inpL, "inp_norm", -1); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -2868,9 +2336,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -2886,13 +2354,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -2902,20 +2370,20 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = 
llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -2934,10 +2402,9 @@ struct llm_build_context { struct ggml_tensor * pos; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); if (model.pos_embd) { // inp_pos - contains the positions @@ -2952,17 +2419,17 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; - attn_norm = llm_build_norm(ctx0, inpL, hparams, + attn_norm = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(attn_norm, "attn_norm", il); // self-attention { cur = attn_norm; - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); if (model.layers[il].bqkv){ @@ -2985,30 +2452,30 @@ struct llm_build_context { // Q/K Layernorm if (model.layers[il].attn_q_norm) { - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Qcur, "Qcur", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Kcur, "Kcur", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } else { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } } @@ -3025,12 +2492,12 @@ struct llm_build_context { // feed forward { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -3040,7 +2507,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3049,13 +2516,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3072,22 +2539,21 @@ struct llm_build_context { 
struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); struct ggml_tensor * inpSA = cur; @@ -3095,21 +2561,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -3122,17 +2588,17 @@ struct llm_build_context { cb(Kcur, "Kcur", il); if (model.layers[il].attn_q_norm) { - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Qcur, "Qcur", il); } if (model.layers[il].attn_k_norm) { - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Kcur, "Kcur", il); } @@ -3151,9 +2617,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -3170,16 +2636,16 @@ struct llm_build_context { // feed-forward network { if (model.layers[il].ffn_norm) { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); } else { // parallel residual cur = inpSA; } - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3189,7 +2655,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3198,14 +2664,14 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = 
build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3222,25 +2688,24 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -3270,9 +2735,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -3287,12 +2752,12 @@ struct llm_build_context { // feed-forward forward { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3302,7 +2767,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3311,13 +2776,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3335,37 +2800,36 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = 
llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); @@ -3384,9 +2848,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -3400,12 +2864,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3414,7 +2878,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3423,13 +2887,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3446,7 +2910,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4); @@ -3454,8 +2918,8 @@ struct llm_build_context { ggml_set_input(lctx.inp_pos); struct ggml_tensor * inp_pos = lctx.inp_pos; - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); @@ -3463,25 +2927,25 @@ struct llm_build_context { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - struct ggml_tensor * 
Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); @@ -3502,9 +2966,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -3518,12 +2982,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3532,7 +2996,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3541,13 +3005,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3568,37 +3032,36 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self_attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); @@ -3617,9 +3080,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, 
Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -3634,13 +3097,13 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); ggml_tensor * moe_out = - llm_build_moe_ffn(ctx0, lctx, cur, + build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -3655,14 +3118,14 @@ struct llm_build_context { // FFN shared expert { - ggml_tensor * cur_gate_inp = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_gate_inp_shexp, cur); + ggml_tensor * cur_gate_inp = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur); cb(cur_gate_inp, "ffn_shexp_gate_inp", il); // sigmoid ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp); cb(cur_gate, "ffn_shexp_gate", il); - ggml_tensor * cur_ffn = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * cur_ffn = build_ffn(cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -3680,7 +3143,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3689,13 +3152,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3715,19 +3178,18 @@ struct llm_build_context { struct ggml_tensor * ffn_output; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { - attn_norm_output = llm_build_norm(ctx0, inpL, hparams, + attn_norm_output = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(attn_norm_output, "attn_norm", il); // self-attention @@ -3737,7 +3199,7 @@ struct llm_build_context { struct ggml_tensor * Vcur = nullptr; if (model.layers[il].wqkv) { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output); + cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -3747,9 +3209,9 @@ struct llm_build_context { Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); } else { - Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq); - Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); - Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, 
attn_norm_output), model.layers[il].bv); + Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); + Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); + Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); } cb(Qcur, "Qcur", il); @@ -3776,9 +3238,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); } if (il == n_layer - 1) { @@ -3791,7 +3253,7 @@ struct llm_build_context { // FF { - ffn_output = llm_build_ffn(ctx0, lctx, attn_norm_output, + ffn_output = build_ffn(attn_norm_output, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -3802,20 +3264,20 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_output); cur = ggml_add(ctx0, cur, inpL); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output_no_bias", -1); cur = ggml_add(ctx0, cur, model.output_b); @@ -3834,19 +3296,13 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = nullptr; - if (hparams.n_swa == 0) { - // Phi-4 doesn't use sliding window attention - KQ_mask = build_inp_KQ_mask(); - } else { - KQ_mask = build_inp_KQ_mask_swa(); - } + lctx.build_attn_inp(ctx0, n_tokens, true, true, worst_case); for (int il = 0; il < n_layer; ++il) { auto residual = inpL; @@ -3854,12 +3310,12 @@ struct llm_build_context { // self-attention { // rope freq factors for 128k context - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); - struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams, + struct ggml_tensor* attn_norm_output = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(attn_norm_output, "attn_norm", il); struct ggml_tensor * Qcur = nullptr; @@ -3867,16 +3323,16 @@ struct llm_build_context { struct ggml_tensor * Vcur = nullptr; if (model.layers[il].wqkv) { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output); + cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output); cb(cur, "wqkv", il); Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd))); Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa))); } else { - Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, 
model.layers[il].wq, attn_norm_output), model.layers[il].bq); - Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); - Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv); + Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); + Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); + Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); } cb(Qcur, "Qcur", il); @@ -3901,9 +3357,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); } if (il == n_layer - 1) { @@ -3916,14 +3372,14 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, residual); residual = cur; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -3932,7 +3388,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); } else { // MoE branch - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -3947,20 +3403,20 @@ struct llm_build_context { } cur = ggml_add(ctx0, residual, cur); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); if (model.output_b != nullptr) { cb(cur, "result_output_no_bias", -1); @@ -3984,20 +3440,19 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); struct ggml_tensor * attention_norm = cur; @@ -4005,13 +3460,13 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct 
ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -4026,9 +3481,9 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } struct ggml_tensor * sa_out = cur; @@ -4044,7 +3499,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4055,7 +3510,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, sa_out); cur = ggml_add(ctx0, cur, inpL); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4064,13 +3519,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4089,13 +3544,12 @@ struct llm_build_context { struct ggml_tensor * pos; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -4104,15 +3558,15 @@ struct llm_build_context { cb(inpL, "inpL", -1); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -4128,9 +3582,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -4146,13 +3600,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -4162,20 +3616,20 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, 
ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4194,24 +3648,23 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -4239,9 +3692,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -4257,13 +3710,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -4273,20 +3726,20 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer inpL = cur; } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4304,41 +3757,40 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + 
LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); // if (model.layers[il].bq) { // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); // cb(Qcur, "Qcur", il); // } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); // if (model.layers[il].bk) { // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); // cb(Kcur, "Kcur", il); // } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); // if (model.layers[il].bv) { // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -4359,9 +3811,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -4375,12 +3827,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4389,7 +3841,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4398,13 +3850,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4422,41 +3874,40 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = 
llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -4477,9 +3928,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -4493,12 +3944,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4507,7 +3958,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4516,13 +3967,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4546,7 +3997,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // scale the input embeddings inpL = ggml_scale(ctx0, inpL, scale_embd); @@ -4555,17 +4006,16 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self_attention @@ -4575,9 +4025,9 @@ struct llm_build_context { q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); cb(q, "q", il); - q = llm_build_norm(ctx0, q, hparams, + q = build_norm(q, model.layers[il].attn_q_a_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(q, "q", il); // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} @@ -4616,9 +4066,9 @@ struct llm_build_context { cb(k_pe, "k_pe", il); kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not 
support non-contiguous norm - kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, + kv_compressed = build_norm(kv_compressed, model.layers[il].attn_kv_a_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(kv_compressed, "kv_compressed", il); // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} @@ -4670,9 +4120,9 @@ struct llm_build_context { struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + k_states, v_states, q_states, n_tokens, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -4692,12 +4142,12 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4711,7 +4161,7 @@ struct llm_build_context { cb(cur, "hidden_scaled_ffn", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4720,9 +4170,9 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head scaling @@ -4731,7 +4181,7 @@ struct llm_build_context { cb(cur, "lmhead_scaling", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4747,7 +4197,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); cb(inpL, "inp_scaled", -1); @@ -4755,26 +4205,25 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -4792,9 +4241,9 @@ struct llm_build_context { ext_factor, 
attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); } if (il == n_layer - 1) { @@ -4807,14 +4256,14 @@ struct llm_build_context { struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); cb(sa_out, "sa_out", il); - cur = llm_build_norm(ctx0, sa_out, hparams, + cur = build_norm(sa_out, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4824,7 +4273,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, sa_out); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4833,13 +4282,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4855,7 +4304,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); cb(inpL, "inp_scaled", -1); @@ -4863,31 +4312,25 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - // gemma 2 requires different mask for layers using sliding window (SWA) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(true); - struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(true); + lctx.build_attn_inp(ctx0, n_tokens, true, true, worst_case); for (int il = 0; il < n_layer; ++il) { - // (il % 2) layers use SWA - struct ggml_tensor * KQ_mask_l = (il % 2 == 0) ? 
KQ_mask_swa : KQ_mask; - // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -4911,14 +4354,14 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f, cb, il); } - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_post_norm", il); if (il == n_layer - 1) { @@ -4931,14 +4374,14 @@ struct llm_build_context { struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); cb(sa_out, "sa_out", il); - cur = llm_build_norm(ctx0, sa_out, hparams, + cur = build_norm(sa_out, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -4947,13 +4390,13 @@ struct llm_build_context { cb(cur, "ffn_out", il); } - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "ffn_post_norm", -1); cur = ggml_add(ctx0, cur, sa_out); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4962,13 +4405,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); // final logit soft-capping cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); @@ -4993,41 +4436,40 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE 
them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -5048,9 +4490,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5065,12 +4507,12 @@ struct llm_build_context { // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -5079,7 +4521,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5088,13 +4530,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5109,21 +4551,20 @@ struct llm_build_context { struct ggml_tensor * inpL; // {n_embd, n_tokens} - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = build_inp_s_copy(); - struct ggml_tensor * state_mask = build_inp_s_mask(); + struct ggml_tensor * state_copy = lctx.build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lctx.build_inp_s_mask(ctx0, worst_case); for (int il = 0; il < n_layer; ++il) { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); - cur = llm_build_mamba(ctx0, lctx, ubatch, gf, cur, - state_copy, state_mask, - kv_head, n_kv, cb, il); + //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il); + cur = lctx.build_mamba_layer(ctx0, gf, cur, state_copy, state_mask, ubatch, il, worst_case); if (il == n_layer - 1) { // skip computing output for unused tokens @@ -5142,13 +4583,13 @@ struct llm_build_context { } // final rmsnorm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + 
LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5167,41 +4608,40 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); struct ggml_tensor * ffn_inp = cur; // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -5220,16 +4660,16 @@ struct llm_build_context { 0); cb(Kcur, "Kcur", il); - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Qcur, "Qcur", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Kcur, "Kcur", il); } @@ -5247,9 +4687,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5264,7 +4704,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, ffn_inp, + cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5276,7 +4716,7 @@ struct llm_build_context { // add together residual + FFN + self-attention cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5285,13 +4725,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); if (f_logit_scale) { cur = 
ggml_scale(ctx0, cur, f_logit_scale); @@ -5315,15 +4755,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - // cohere2 requires different mask for layers using sliding window (SWA) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); - struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(); + lctx.build_attn_inp(ctx0, n_tokens, true, true, worst_case); // sliding window switch pattern const int32_t sliding_window_pattern = 4; @@ -5331,35 +4768,34 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { // three layers sliding window attention (window size 4096) and ROPE // fourth layer uses global attention without positional embeddings - const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); - struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask; + const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); // norm - cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM, cb, il); + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il); cb(cur, "attn_norm", il); struct ggml_tensor * ffn_inp = cur; // self-attention { // rope freq factors for 128k context - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -5385,8 +4821,8 @@ struct llm_build_context { cb(Kcur, "Kcur", il); } - cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, - KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il); + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, + n_tokens, 1.0f / sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5401,7 +4837,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, lctx, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, + cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); @@ -5410,7 +4846,7 @@ struct llm_build_context { // add together residual + FFN + self-attention cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); 
cb(cur, "l_out", il); // input for next layer @@ -5419,11 +4855,11 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM, cb, -1); + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); if (f_logit_scale) { cur = ggml_scale(ctx0, cur, f_logit_scale); @@ -5455,41 +4891,40 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, NULL, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (hparams.f_clamp_kqv > 0.0f) { Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (hparams.f_clamp_kqv > 0.0f) { Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (hparams.f_clamp_kqv > 0.0f) { Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); @@ -5510,9 +4945,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, nullptr, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5527,12 +4962,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, NULL, NULL, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5543,7 +4978,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5552,13 +4987,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, NULL, NULL, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); 
ggml_build_forward_expand(gf, cur); @@ -5579,13 +5014,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5595,21 +5029,21 @@ struct llm_build_context { // self_attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); - Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, - LLM_NORM_RMS, cb, il); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, + LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, - LLM_NORM_RMS, cb, il); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, + LLM_NORM_RMS, il); cb(Kcur, "Kcur_normed", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); @@ -5629,14 +5063,14 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_post_norm", il); if (il == n_layer - 1) { @@ -5651,7 +5085,7 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_ffn(ctx0, lctx, ffn_inp, + cur = build_ffn(ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5659,15 +5093,15 @@ struct llm_build_context { LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "ffn_post_norm", -1); cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5676,13 +5110,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5707,41 +5141,40 @@ struct llm_build_context { struct ggml_tensor * cur; 
struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self_attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); - Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, - LLM_NORM_RMS, cb, il); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, + LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, - LLM_NORM_RMS, cb, il); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, + LLM_NORM_RMS, il); cb(Kcur, "Kcur_normed", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); @@ -5761,9 +5194,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5778,12 +5211,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -5797,7 +5230,7 @@ struct llm_build_context { cb(cur, "ffn_moe_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5806,13 +5239,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5828,13 +5261,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 
head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { const int64_t n_head = hparams.n_head(il); @@ -5845,14 +5277,14 @@ struct llm_build_context { struct ggml_tensor * residual = cur; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens); @@ -5866,14 +5298,14 @@ struct llm_build_context { struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv))); cb(Vcur, "Vcur", il); - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(Qcur, "Qcur", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(Kcur, "Kcur", il); Qcur = ggml_rope_ext( @@ -5891,9 +5323,9 @@ struct llm_build_context { Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens); cb(Qcur, "Vcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -5908,12 +5340,12 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -5923,7 +5355,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); inpL = cur; @@ -5932,12 +5364,12 @@ struct llm_build_context { cur = inpL; // norm - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5955,24 +5387,23 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // 
self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -6000,9 +5431,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -6019,13 +5450,13 @@ struct llm_build_context { struct ggml_tensor * attn_out = cur; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -6037,7 +5468,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, attn_out); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6050,13 +5481,13 @@ struct llm_build_context { struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -6065,7 +5496,7 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6073,13 +5504,13 @@ struct llm_build_context { } } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -6100,33 +5531,32 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * 
Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -6143,9 +5573,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -6160,12 +5590,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -6177,12 +5607,12 @@ struct llm_build_context { cb(ffn_out, "ffn_out", il); // MoE - cur = llm_build_norm(ctx0, inpSA, hparams, + cur = build_norm(inpSA, model.layers[il].ffn_norm_exps, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm_exps", il); - cur = llm_build_moe_ffn(ctx0, lctx, cur, + cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -6198,7 +5628,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_out); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6207,13 +5637,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -6234,44 +5664,45 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); + const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -6292,9 +5723,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + Kcur, Vcur, Qcur, n_tokens, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -6309,13 +5740,13 @@ struct llm_build_context { struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); if ((uint32_t) il < hparams.n_layer_dense_lead) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -6325,7 +5756,7 @@ struct llm_build_context { } else { // MoE branch ggml_tensor * moe_out = - llm_build_moe_ffn(ctx0, lctx, cur, + build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -6340,7 +5771,7 @@ struct llm_build_context { // FFN shared expert { - ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * ffn_shexp = build_ffn(cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -6354,7 +5785,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6363,13 +5794,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); @@ -6400,21 +5831,20 @@ struct llm_build_context { struct ggml_tensor * inpL; // {n_embd, n_tokens} - inpL = llm_build_inp_embd(ctx0, lctx, 
hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self_attention @@ -6425,9 +5855,9 @@ struct llm_build_context { q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); cb(q, "q", il); - q = llm_build_norm(ctx0, q, hparams, + q = build_norm(q, model.layers[il].attn_q_a_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(q, "q", il); // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} @@ -6470,9 +5900,9 @@ struct llm_build_context { cb(k_pe, "k_pe", il); kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm - kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, + kv_compressed = build_norm(kv_compressed, model.layers[il].attn_kv_a_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(kv_compressed, "kv_compressed", il); // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} @@ -6524,9 +5954,9 @@ struct llm_build_context { struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + k_states, v_states, q_states, n_tokens, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -6540,13 +5970,13 @@ struct llm_build_context { struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); if ((uint32_t) il < hparams.n_layer_dense_lead) { - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -6556,7 +5986,7 @@ struct llm_build_context { } else { // MoE branch ggml_tensor * moe_out = - llm_build_moe_ffn(ctx0, lctx, cur, + build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -6571,7 +6001,7 @@ struct llm_build_context { // FFN shared expert { - ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur, + ggml_tensor * ffn_shexp = build_ffn(cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -6585,7 +6015,7 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6594,9 +6024,9 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, 
"result_norm", -1); // lm_head @@ -6617,26 +6047,25 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); if (model.layers[il].wq_scale) { Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale); } @@ -6647,7 +6076,7 @@ struct llm_build_context { } // B1.K - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); if (model.layers[il].wk_scale) { Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale); } @@ -6658,7 +6087,7 @@ struct llm_build_context { } // B1.V - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); if (model.layers[il].wv_scale) { Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale); } @@ -6682,16 +6111,16 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, NULL, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].attn_sub_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_sub_norm", il); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); + cur = build_lora_mm(model.layers[il].wo, cur); if (model.layers[il].wo_scale) { cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale); } @@ -6712,12 +6141,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward forward - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale, NULL, NULL, NULL, @@ -6725,12 +6154,12 @@ struct llm_build_context { LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_sub_out", il); - cur = llm_build_norm(ctx0, cur, hparams, - model.layers[il].ffn_sub_norm, NULL, - LLM_NORM_RMS, cb, il); + cur = build_norm(cur, + model.layers[il].ffn_sub_norm, NULL, + LLM_NORM_RMS, il); cb(cur, "ffn_sub_norm", il); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_down, cur); + cur = build_lora_mm(model.layers[il].ffn_down, cur); if (model.layers[il].ffn_down_scale) { cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale); } @@ -6745,356 +6174,356 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, 
hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head // FIXME: do not use model.tok_embd directly, duplicate as model.output - cur = llm_build_lora_mm(lctx, ctx0, model.tok_embd, cur); + cur = build_lora_mm(model.tok_embd, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); return gf; } - struct ggml_cgraph * build_t5_enc() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + //struct ggml_cgraph * build_t5_enc() { + // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; + // // mutable variable, needed during the last layer of the computation to skip unused tokens + // int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + // const int64_t n_embd_head = hparams.n_embd_head_v; + // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + // struct ggml_tensor * cur; + // struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + // inpL = build_inp_embd(model.tok_embd); - GGML_ASSERT(lctx.is_encoding); - struct ggml_tensor * pos_bucket_enc = build_pos_bucket(false); + // GGML_ASSERT(lctx.is_encoding); + // struct ggml_tensor * pos_bucket_enc = build_pos_bucket(false); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); + // // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + // struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + // for (int il = 0; il < n_layer; ++il) { + // struct ggml_tensor * inpSA = inpL; - // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm_enc, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); + // // norm + // cur = build_norm(inpL, + // model.layers[il].attn_norm_enc, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "attn_norm", il); - // self-attention - { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur); - cb(Qcur, "Qcur", il); + // // self-attention + // { + // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_enc, cur); + // cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur); - cb(Kcur, "Kcur", il); + // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_enc, cur); + // cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur); - cb(Vcur, "Vcur", il); + // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_enc, cur); + // cb(Vcur, "Vcur", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + // Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + 
// struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + // struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); + // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // cb(kq, "kq", il); - struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; - struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_enc, attn_rel_b); - struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); - cb(kq_b, "kq_b", il); + // struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc; + // struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_enc, attn_rel_b); + // struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); + // cb(kq_b, "kq_b", il); - kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + // kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias); + // cb(kq, "kq_soft_max_ext", il); - struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); - cb(v, "v", il); + // struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); + // cb(v, "v", il); - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); - cb(kqv, "kqv", il); + // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq); + // cb(kqv, "kqv", il); - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + // cb(kqv_merged, "kqv_merged", il); - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); + // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + // cb(cur, "kqv_merged_cont", il); - ggml_build_forward_expand(gf, cur); + // ggml_build_forward_expand(gf, cur); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur); - cb(cur, "kqv_out", il); - } + // cur = build_lora_mm(model.layers[il].wo_enc, cur); + // cb(cur, "kqv_out", il); + // } - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } + // if (il == n_layer - 1) { + // // skip computing output for unused tokens + // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + // n_tokens = n_outputs; + // cur = ggml_get_rows(ctx0, cur, inp_out_ids); + // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + // } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); + // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + // cb(ffn_inp, "ffn_inp", il); - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm_enc, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); + // // feed-forward network + // { + // cur = build_norm(ffn_inp, + // model.layers[il].ffn_norm_enc, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "ffn_norm", il); - // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, lctx, cur, - 
model.layers[il].ffn_up_enc, NULL, NULL, - model.layers[il].ffn_gate_enc, NULL, NULL, - model.layers[il].ffn_down_enc, NULL, NULL, - NULL, - model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, - cb, il); - cb(cur, "ffn_out", il); - } + // // T5 uses relu, flan-T5 uses gelu-gated + // cur = build_ffn(cur, + // model.layers[il].ffn_up_enc, NULL, NULL, + // model.layers[il].ffn_gate_enc, NULL, NULL, + // model.layers[il].ffn_down_enc, NULL, NULL, + // NULL, + // model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + // model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, + // cb, il); + // cb(cur, "ffn_out", il); + // } - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); + // cur = ggml_add(ctx0, cur, ffn_inp); + // cb(cur, "ffn_out", il); - ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - if (layer_dir != nullptr) { - cur = ggml_add(ctx0, cur, layer_dir); - } - cb(cur, "l_out", il); + // ggml_tensor * layer_dir = cvec.tensor_for(il); + // if (layer_dir != nullptr) { + // cur = ggml_add(ctx0, cur, layer_dir); + // } + // cb(cur, "l_out", il); - // input for next layer - inpL = cur; - } + // // input for next layer + // inpL = cur; + // } - cur = inpL; - cb(cur, "result_embd", -1); + // cur = inpL; + // cb(cur, "result_embd", -1); - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm_enc, NULL, - LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); + // cur = build_norm(cur, + // model.output_norm_enc, NULL, + // LLM_NORM_RMS, -1); + // cb(cur, "result_norm", -1); - ggml_build_forward_expand(gf, cur); + // ggml_build_forward_expand(gf, cur); - return gf; - } + // return gf; + //} - struct ggml_cgraph * build_t5_dec() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + //struct ggml_cgraph * build_t5_dec() { + // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; + // // mutable variable, needed during the last layer of the computation to skip unused tokens + // int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + // const int64_t n_embd_head = hparams.n_embd_head_v; + // const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + // GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; + // struct ggml_tensor * cur; + // struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + // inpL = build_inp_embd(model.tok_embd); - GGML_ASSERT(!lctx.is_encoding); - GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); + // GGML_ASSERT(!lctx.is_encoding); + // GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first"); - struct ggml_tensor * embd_enc = build_inp_embd_enc(); - struct ggml_tensor * pos_bucket_dec = build_pos_bucket(true); + // struct ggml_tensor * embd_enc = build_inp_embd_enc(); + // struct ggml_tensor * pos_bucket_dec = build_pos_bucket(true); - struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); - struct ggml_tensor * KQ_mask_cross = build_inp_KQ_mask_cross(); + // struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask(); + // struct ggml_tensor * KQ_mask_cross = build_inp_KQ_mask_cross(); - for (int il = 0; il < n_layer; ++il) { - struct 
ggml_tensor * inpSA = inpL; + // for (int il = 0; il < n_layer; ++il) { + // struct ggml_tensor * inpSA = inpL; - // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); + // // norm + // cur = build_norm(inpL, + // model.layers[il].attn_norm, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "attn_norm", il); - // self-attention - { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); + // // self-attention + // { + // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + // cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); + // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + // cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); + // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + // cb(Vcur, "Vcur", il); - llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); + // build_kv_store(gf, Kcur, Vcur, il); - struct ggml_tensor * k = - ggml_view_3d(ctx0, kv_self.k_l[il], - n_embd_head_k, n_kv, n_head_kv, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), - 0); - cb(k, "k", il); + // struct ggml_tensor * k = + // ggml_view_3d(ctx0, kv_self.k_l[il], + // n_embd_head_k, n_kv, n_head_kv, + // ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + // ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + // 0); + // cb(k, "k", il); - struct ggml_tensor * v = - ggml_view_3d(ctx0, kv_self.v_l[il], - n_kv, n_embd_head_v, n_head_kv, - ggml_element_size(kv_self.v_l[il])*n_ctx, - ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, - 0); - cb(v, "v", il); + // struct ggml_tensor * v = + // ggml_view_3d(ctx0, kv_self.v_l[il], + // n_kv, n_embd_head_v, n_head_kv, + // ggml_element_size(kv_self.v_l[il])*n_ctx, + // ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v, + // 0); + // cb(v, "v", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); + // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // cb(kq, "kq", il); - struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; - struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_dec, attn_rel_b); - struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); - cb(kq_b, "kq_b", il); + // struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? 
model.layers[il].attn_rel_b : model.layers[0].attn_rel_b; + // struct ggml_tensor * pos_bias = build_pos_bias(pos_bucket_dec, attn_rel_b); + // struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias); + // cb(kq_b, "kq_b", il); - kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + // kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias); + // cb(kq, "kq_soft_max_ext", il); - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); - cb(kqv, "kqv", il); + // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + // cb(kqv, "kqv", il); - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + // cb(kqv_merged, "kqv_merged", il); - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); + // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + // cb(cur, "kqv_merged_cont", il); - ggml_build_forward_expand(gf, cur); + // ggml_build_forward_expand(gf, cur); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); - cb(cur, "kqv_out", il); - } + // cur = build_lora_mm(model.layers[il].wo, cur); + // cb(cur, "kqv_out", il); + // } - cur = ggml_add(ctx0, cur, inpSA); - cb(cur, "cross_inp", il); + // cur = ggml_add(ctx0, cur, inpSA); + // cb(cur, "cross_inp", il); - struct ggml_tensor * inpCA = cur; + // struct ggml_tensor * inpCA = cur; - // norm - cur = llm_build_norm(ctx0, cur, hparams, - model.layers[il].attn_norm_cross, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm_cross", il); + // // norm + // cur = build_norm(cur, + // model.layers[il].attn_norm_cross, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "attn_norm_cross", il); - // cross-attention - { - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur); - cb(Qcur, "Qcur", il); + // // cross-attention + // { + // struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq_cross, cur); + // cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc); - cb(Kcur, "Kcur", il); + // struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk_cross, embd_enc); + // cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc); - cb(Vcur, "Vcur", il); + // struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv_cross, embd_enc); + // cb(Vcur, "Vcur", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); + // Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + // Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc); - struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + // struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + // struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cb(kq, "kq", il); + // struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // cb(kq, "kq", il); - kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + // kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias); + // cb(kq, "kq_soft_max_ext", 
il); - struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); - cb(v, "v", il); + // struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc))); + // cb(v, "v", il); - struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); - cb(kqv, "kqv", il); + // struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq); + // cb(kqv, "kqv", il); - struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); + // struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + // cb(kqv_merged, "kqv_merged", il); - cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); - cb(cur, "kqv_merged_cont", il); + // cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens); + // cb(cur, "kqv_merged_cont", il); - ggml_build_forward_expand(gf, cur); + // ggml_build_forward_expand(gf, cur); - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur); - cb(cur, "kqv_out", il); - } + // cur = build_lora_mm(model.layers[il].wo_cross, cur); + // cb(cur, "kqv_out", il); + // } - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); - } + // if (il == n_layer - 1) { + // // skip computing output for unused tokens + // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + // n_tokens = n_outputs; + // cur = ggml_get_rows(ctx0, cur, inp_out_ids); + // inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + // inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); + // } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); - cb(ffn_inp, "ffn_inp", il); + // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA); + // cb(ffn_inp, "ffn_inp", il); - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); + // // feed-forward network + // { + // cur = build_norm(ffn_inp, + // model.layers[il].ffn_norm, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "ffn_norm", il); - // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, - cb, il); - cb(cur, "ffn_out", il); - } + // // T5 uses relu, flan-T5 uses gelu-gated + // cur = build_ffn(cur, + // model.layers[il].ffn_up, NULL, NULL, + // model.layers[il].ffn_gate, NULL, NULL, + // model.layers[il].ffn_down, NULL, NULL, + // NULL, + // model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, + // model.layers[il].ffn_gate_enc ? 
LLM_FFN_PAR : LLM_FFN_SEQ, + // cb, il); + // cb(cur, "ffn_out", il); + // } - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); + // cur = ggml_add(ctx0, cur, ffn_inp); + // cb(cur, "ffn_out", il); - ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); - if (layer_dir != nullptr) { - cur = ggml_add(ctx0, cur, layer_dir); - } - cb(cur, "l_out", il); + // ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + // if (layer_dir != nullptr) { + // cur = ggml_add(ctx0, cur, layer_dir); + // } + // cb(cur, "l_out", il); - // input for next layer - inpL = cur; - } + // // input for next layer + // inpL = cur; + // } - cur = inpL; - cb(cur, "result_embd", -1); + // cur = inpL; + // cb(cur, "result_embd", -1); - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); + // cur = build_norm(cur, + // model.output_norm, NULL, + // LLM_NORM_RMS, -1); + // cb(cur, "result_norm", -1); + + // // lm_head + // cur = build_lora_mm(model.output, cur); + // cb(cur, "result_output", -1); + + // ggml_build_forward_expand(gf, cur); - return gf; - } + // return gf; + //} struct ggml_cgraph * build_jais() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); @@ -7106,21 +6535,20 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -7136,9 +6564,9 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/float(n_embd_head), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/float(n_embd_head), cb, il); } if (il == n_layer - 1) { @@ -7154,13 +6582,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -7173,13 +6601,13 @@ struct llm_build_context { cb(inpL, "l_out", il); } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); @@ -7198,21 +6626,20 
@@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention @@ -7221,7 +6648,7 @@ struct llm_build_context { struct ggml_tensor * Kcur = nullptr; struct ggml_tensor * Vcur = nullptr; - cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); + cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -7249,9 +6676,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -7268,13 +6695,13 @@ struct llm_build_context { // FF { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7288,13 +6715,13 @@ struct llm_build_context { cb(inpL, "l_out", il); } - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -7312,42 +6739,41 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, model.layers[il].attn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); 
cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -7368,9 +6794,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -7384,13 +6810,13 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -7409,13 +6835,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -7436,44 +6862,43 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); // self-attention { // rope freq factors for llama3; may return nullptr for llama2 and other models - struct ggml_tensor * rope_factors = build_rope_factors(il); + struct ggml_tensor * rope_factors = lctx.get_rope_factors(il); // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -7494,9 +6919,9 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, 
model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); } if (il == n_layer - 1) { @@ -7511,12 +6936,12 @@ struct llm_build_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7536,13 +6961,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -7550,230 +6975,232 @@ struct llm_build_context { return gf; } - ggml_cgraph * build_rwkv6() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + //ggml_cgraph * build_rwkv6() { + // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // Token shift state dimensions should be 2 * n_emb - GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2); + // // Token shift state dimensions should be 2 * n_emb + // GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2); - const int64_t n_seqs = ubatch.n_seqs; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(n_seqs != 0); - GGML_ASSERT(ubatch.equal_seqs); - GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); + // const int64_t n_seqs = ubatch.n_seqs; + // const int64_t n_seq_tokens = ubatch.n_seq_tokens; + // const int64_t n_tokens = ubatch.n_tokens; + // GGML_ASSERT(n_seqs != 0); + // GGML_ASSERT(ubatch.equal_seqs); + // GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - struct ggml_tensor * state_copy = build_inp_s_copy(); - struct ggml_tensor * state_mask = build_inp_s_mask(); + // struct ggml_tensor * cur; + // struct ggml_tensor * inpL; + // struct ggml_tensor * state_copy = build_inp_s_copy(); + // struct ggml_tensor * state_mask = build_inp_s_mask(); - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); - inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); + // inpL = build_inp_embd(model.tok_embd); + // inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); - for (int il = 0; il < n_layer; ++il) { - const llama_layer * layer = &model.layers[il]; + // for (int il = 0; il < n_layer; ++il) { + // const llama_layer * layer = &model.layers[il]; - // (ab)using the KV cache to store the states - struct ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0, - gf, kv_self.k_l[il], state_copy, state_mask, - hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs); - struct ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0, - gf, kv_self.v_l[il], state_copy, state_mask, - hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs); + // // (ab)using the KV cache to store the states + // struct ggml_tensor * token_shift = build_copy_mask_state( + // gf, kv_self.k_l[il], state_copy, state_mask, + // 
hparams.n_embd_k_s(), n_seqs); - cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs); + // struct ggml_tensor * wkv_states = build_copy_mask_state( + // gf, kv_self.v_l[il], state_copy, state_mask, + // hparams.n_embd_v_s(), n_seqs); - struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); - struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); + // cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); + // token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs); - struct ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, il); - struct ggml_tensor * x_prev = ggml_concat( - ctx0, - att_shift, - ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), - 1 - ); + // struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); + // struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); - cur = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, n_embd / hparams.wkv_head_size)); - ggml_build_forward_expand(gf, cur); - ggml_build_forward_expand( - gf, - ggml_cpy( - ctx0, - wkv_states, - ggml_view_1d( - ctx0, - kv_self.v_l[il], - hparams.n_embd_v_s() * n_seqs, - hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - ) - ) - ); + // struct ggml_tensor * x_norm_att = build_norm(cur, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il); + // struct ggml_tensor * x_prev = ggml_concat( + // ctx0, + // att_shift, + // ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), + // 1 + // ); - struct ggml_tensor * x_norm_ffn = llm_build_norm(ctx0, cur, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, il); - x_prev = ggml_concat( - ctx0, - ffn_shift, - ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0), - 1 - ); - cur = ggml_add(ctx0, cur, llm_build_rwkv6_channel_mix(lctx, ctx0, layer, x_norm_ffn, x_prev)); - ggml_build_forward_expand(gf, cur); + // cur = ggml_add(ctx0, cur, build_rwkv6_time_mix(layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, n_embd / hparams.wkv_head_size)); + // ggml_build_forward_expand(gf, cur); + // ggml_build_forward_expand( + // gf, + // ggml_cpy( + // ctx0, + // wkv_states, + // ggml_view_1d( + // ctx0, + // kv_self.v_l[il], + // hparams.n_embd_v_s() * n_seqs, + // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) + // ) + // ) + // ); - struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att)); - struct ggml_tensor * last_norm_ffn = ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_ffn)); + // struct ggml_tensor * x_norm_ffn = build_norm(cur, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il); + // x_prev = ggml_concat( + // ctx0, + // ffn_shift, + // 
ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0), + // 1 + // ); + // cur = ggml_add(ctx0, cur, build_rwkv6_channel_mix(layer, x_norm_ffn, x_prev)); + // ggml_build_forward_expand(gf, cur); - token_shift = ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1); + // struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att)); + // struct ggml_tensor * last_norm_ffn = ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_ffn)); - ggml_build_forward_expand( - gf, - ggml_cpy( - ctx0, - ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0), - ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) - ) - ); + // token_shift = ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1); - if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { - cur = ggml_scale(ctx0, cur, 0.5F); - } + // ggml_build_forward_expand( + // gf, + // ggml_cpy( + // ctx0, + // ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0), + // ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) + // ) + // ); - cur = lctx.cvec.apply_to(ctx0, cur, il); - cb(cur, "l_out", il); + // if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { + // cur = ggml_scale(ctx0, cur, 0.5F); + // } - // input for next layer - inpL = cur; - } + // cur = lctx.cvec.apply_to(ctx0, cur, il); + // cb(cur, "l_out", il); - cur = inpL; - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); + // // input for next layer + // inpL = cur; + // } - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1); - cb(cur, "result_norm", -1); + // cur = inpL; + // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + // cur = ggml_get_rows(ctx0, cur, inp_out_ids); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); - cb(cur, "result_output", -1); + // cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); + // cb(cur, "result_norm", -1); - ggml_build_forward_expand(gf, cur); + // cur = build_lora_mm(model.output, cur); + // cb(cur, "result_output", -1); - return gf; - } + // ggml_build_forward_expand(gf, cur); + + // return gf; + //} // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py - ggml_cgraph * build_rwkv6qwen2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + //ggml_cgraph * build_rwkv6qwen2() { + // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - GGML_ASSERT(n_embd == hparams.n_embd_k_s()); + // GGML_ASSERT(n_embd == hparams.n_embd_k_s()); - const int64_t n_seqs = ubatch.n_seqs; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_tokens = ubatch.n_tokens; - GGML_ASSERT(n_seqs != 0); - GGML_ASSERT(ubatch.equal_seqs); - GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); + // const int64_t n_seqs = ubatch.n_seqs; + // const int64_t n_seq_tokens = ubatch.n_seq_tokens; + // const int64_t n_tokens = 
ubatch.n_tokens; + // GGML_ASSERT(n_seqs != 0); + // GGML_ASSERT(ubatch.equal_seqs); + // GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - struct ggml_tensor * state_copy = build_inp_s_copy(); - struct ggml_tensor * state_mask = build_inp_s_mask(); + // struct ggml_tensor * cur; + // struct ggml_tensor * inpL; + // struct ggml_tensor * state_copy = build_inp_s_copy(); + // struct ggml_tensor * state_mask = build_inp_s_mask(); - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + // inpL = build_inp_embd(model.tok_embd); - for (int il = 0; il < n_layer; ++il) { - const llama_layer * layer = &model.layers[il]; + // for (int il = 0; il < n_layer; ++il) { + // const llama_layer * layer = &model.layers[il]; - // (ab)using the KV cache to store the states - struct ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0, - gf, kv_self.k_l[il], state_copy, state_mask, - hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs); - struct ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0, - gf, kv_self.v_l[il], state_copy, state_mask, - hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs); + // // (ab)using the KV cache to store the states + // struct ggml_tensor * token_shift = build_copy_mask_state( + // gf, kv_self.k_l[il], state_copy, state_mask, + // hparams.n_embd_k_s(), n_seqs); - cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 1, n_seqs); + // struct ggml_tensor * wkv_states = build_copy_mask_state( + // gf, kv_self.v_l[il], state_copy, state_mask, + // hparams.n_embd_v_s(), n_seqs); - struct ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, cb, il); - struct ggml_tensor * x_prev = ggml_concat( - ctx0, - token_shift, - ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), - 1 - ); + // cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); + // token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 1, n_seqs); - ggml_build_forward_expand( - gf, - ggml_cpy( - ctx0, - wkv_states, - ggml_view_1d( - ctx0, - kv_self.v_l[il], - hparams.n_embd_v_s() * n_seqs, - hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - ) - ) - ); + // struct ggml_tensor * x_norm_att = build_norm(cur, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il); + // struct ggml_tensor * x_prev = ggml_concat( + // ctx0, + // token_shift, + // ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), + // 1 + // ); - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, hparams.n_head_kv())); - ggml_build_forward_expand(gf, ffn_inp); - ggml_build_forward_expand( - gf, - ggml_cpy( - ctx0, - wkv_states, - ggml_view_1d( - ctx0, - kv_self.v_l[il], - hparams.n_embd_v_s() * n_seqs, - hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - ) - ) - ); + // ggml_build_forward_expand( + // gf, + // ggml_cpy( + // ctx0, + // wkv_states, + // ggml_view_1d( + // ctx0, + // kv_self.v_l[il], + // hparams.n_embd_v_s() * n_seqs, + // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) + // ) + // ) + // ); - cb(ffn_inp, "ffn_inp", il); + // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, build_rwkv6_time_mix(layer, x_norm_att, x_prev, &wkv_states, 
hparams.wkv_head_size, hparams.n_head_kv())); + // ggml_build_forward_expand(gf, ffn_inp); + // ggml_build_forward_expand( + // gf, + // ggml_cpy( + // ctx0, + // wkv_states, + // ggml_view_1d( + // ctx0, + // kv_self.v_l[il], + // hparams.n_embd_v_s() * n_seqs, + // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) + // ) + // ) + // ); - // feed-forward network - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); + // cb(ffn_inp, "ffn_inp", il); - cur = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); + // // feed-forward network + // cur = build_norm(ffn_inp, + // model.layers[il].ffn_norm, NULL, + // LLM_NORM_RMS, il); + // cb(cur, "ffn_norm", il); - cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); - cb(cur, "l_out", il); + // cur = build_ffn(cur, + // model.layers[il].ffn_up, NULL, NULL, + // model.layers[il].ffn_gate, NULL, NULL, + // model.layers[il].ffn_down, NULL, NULL, + // NULL, + // LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + // cb(cur, "ffn_out", il); - // input for next layer - inpL = cur; - } + // cur = ggml_add(ctx0, cur, ffn_inp); + // cur = lctx.cvec.apply_to(ctx0, cur, il); + // cb(cur, "l_out", il); - cur = inpL; - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); + // // input for next layer + // inpL = cur; + // } - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); + // cur = inpL; + // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + // cur = ggml_get_rows(ctx0, cur, inp_out_ids); - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); - cb(cur, "result_output", -1); + // cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1); + // cb(cur, "result_norm", -1); - ggml_build_forward_expand(gf, cur); + // cur = build_lora_mm(model.output, cur); + // cb(cur, "result_output", -1); - return gf; - } + // ggml_build_forward_expand(gf, cur); + + // return gf; + //} // ref: https://github.com/facebookresearch/chameleon // based on the original build_llama() function, changes: @@ -7794,13 +7221,12 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + lctx.build_attn_inp(ctx0, n_tokens, true, false, worst_case); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7809,22 +7235,22 @@ struct llm_build_context { if (hparams.swin_norm) { cur = inpL; } else { - cur = llm_build_norm(ctx0, inpL, hparams, + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "attn_norm", il); } // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); 
cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].attn_q_norm) { @@ -7834,10 +7260,10 @@ struct llm_build_context { 0); cb(Qcur, "Qcur", il); - Qcur = llm_build_norm(ctx0, Qcur, hparams, + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Qcur, "Qcur", il); } @@ -7848,10 +7274,10 @@ struct llm_build_context { 0); cb(Kcur, "Kcur", il); - Kcur = llm_build_norm(ctx0, Kcur, hparams, + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, - LLM_NORM, cb, il); + LLM_NORM, il); cb(Kcur, "Kcur", il); } @@ -7869,14 +7295,14 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, + cur = build_attn(gf, model.layers[il].wo, nullptr, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, n_tokens, 1.0f/sqrtf(float(n_embd_head)), cb, il); if (hparams.swin_norm) { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); } } @@ -7893,13 +7319,13 @@ struct llm_build_context { // feed-forward network if (!hparams.swin_norm) { - cur = llm_build_norm(ctx0, ffn_inp, hparams, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); } - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -7908,9 +7334,9 @@ struct llm_build_context { cb(cur, "ffn_out", il); if (hparams.swin_norm) { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); + LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); } @@ -7926,13 +7352,13 @@ struct llm_build_context { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); + LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "result_output_with_img_logits", -1); // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs. 
@@ -7959,7 +7385,7 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + inpL = build_inp_embd(model.tok_embd); cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); @@ -7978,20 +7404,20 @@ struct llm_build_context { case 3: case 4: { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, layer.norm1, layer.norm1_b, - LLM_NORM_GROUP, cb, 0); + LLM_NORM_GROUP, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1); cur = ggml_add(ctx0, cur, layer.conv1_b); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, layer.norm2, layer.norm2_b, - LLM_NORM_GROUP, cb, 0); + LLM_NORM_GROUP, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); @@ -8002,10 +7428,10 @@ struct llm_build_context { } break; case 2: { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, layer.attn_norm, layer.attn_norm_b, - LLM_NORM_GROUP, cb, 0); + LLM_NORM_GROUP, 0); struct ggml_tensor * q; struct ggml_tensor * k; @@ -8035,10 +7461,10 @@ struct llm_build_context { } break; case 5: { - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, layer.norm, layer.norm_b, - LLM_NORM_GROUP, cb, 0); + LLM_NORM_GROUP, 0); } break; default: GGML_ABORT("unknown posnet layer"); }; @@ -8046,10 +7472,10 @@ struct llm_build_context { cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.tok_norm, model.tok_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); @@ -8066,12 +7492,12 @@ struct llm_build_context { cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, layer.norm, layer.norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); - cur = llm_build_ffn(ctx0, lctx, cur, + cur = build_ffn(cur, layer.pw1, layer.pw1_b, NULL, NULL, NULL, NULL, layer.pw2, layer.pw2_b, NULL, @@ -8089,13 +7515,13 @@ struct llm_build_context { cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - cur = llm_build_norm(ctx0, cur, hparams, + cur = build_norm(cur, model.output_norm, model.output_norm_b, - LLM_NORM, cb, -1); + LLM_NORM, -1); // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_lora_mm(model.output, cur); cur = ggml_add(ctx0, cur, model.output_b); cb(cur, "result_embd", -1); @@ -8106,7 +7532,7 @@ struct llm_build_context { } }; -static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { +static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx) { llama_ubatch dummy = {}; dummy.equal_seqs = true; @@ -8116,7 +7542,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const llm.init(); - struct ggml_cgraph * result = llm.build_defrag(ids); + struct ggml_cgraph * result = llm.build_defrag(); llm.free(); @@ -8356,18 +7782,18 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_bitnet(); } break; - case LLM_ARCH_T5: - { - if (lctx.is_encoding) { - result = llm.build_t5_enc(); - } else { - result = llm.build_t5_dec(); - } - } break; - case LLM_ARCH_T5ENCODER: - { - result = llm.build_t5_enc(); - } break; + //case LLM_ARCH_T5: + // { + // if (lctx.is_encoding) { + // result = llm.build_t5_enc(); + // } else { + // result = llm.build_t5_dec(); + // } + // } break; + //case LLM_ARCH_T5ENCODER: + // { + // result = llm.build_t5_enc(); + // } break; case 
LLM_ARCH_JAIS: { result = llm.build_jais(); @@ -8380,14 +7806,14 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_exaone(); } break; - case LLM_ARCH_RWKV6: - { - result = llm.build_rwkv6(); - } break; - case LLM_ARCH_RWKV6QWEN2: - { - result = llm.build_rwkv6qwen2(); - } break; + //case LLM_ARCH_RWKV6: + // { + // result = llm.build_rwkv6(); + // } break; + //case LLM_ARCH_RWKV6QWEN2: + // { + // result = llm.build_rwkv6qwen2(); + // } break; case LLM_ARCH_CHAMELEON: { result = llm.build_chameleon(); @@ -8543,6 +7969,7 @@ static int llama_decode_impl( } else { ubatch = lctx.sbatch.split_simple(n_ubatch); } + const uint32_t n_tokens = ubatch.n_tokens; // count the outputs in this u_batch @@ -8567,6 +7994,8 @@ static int llama_decode_impl( GGML_ASSERT(n_threads > 0); + lctx.prepare_decode(ubatch); + // non-causal masks do not use the KV cache if (hparams.causal_attn) { llama_kv_self_update(&lctx); // TODO: lctx->kv_self_update() @@ -8600,6 +8029,12 @@ static int llama_decode_impl( ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + + ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); + + lctx.set_inputs(ubatch); + // the output is always the last tensor in the graph struct ggml_tensor * res = ggml_graph_node(gf, -1); struct ggml_tensor * embd = ggml_graph_node(gf, -2); @@ -8623,12 +8058,6 @@ static int llama_decode_impl( GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); } - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); - - ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - - llama_set_inputs(lctx, ubatch); - const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool); if (compute_status != GGML_STATUS_SUCCESS) { kv_slot_restorer.restore(kv_self); @@ -8850,11 +8279,17 @@ static int llama_encode_impl( GGML_ASSERT(n_threads > 0); + lctx.prepare_decode(ubatch); + ggml_backend_sched_reset(lctx.sched.get()); ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); + ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); + + lctx.set_inputs(ubatch); + // the output embeddings after the final encoder normalization struct ggml_tensor * embd = nullptr; @@ -8875,10 +8310,6 @@ static int llama_encode_impl( } } - ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - - llama_set_inputs(lctx, ubatch); - const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool); switch (compute_status) { case GGML_STATUS_SUCCESS: @@ -8966,227 +8397,6 @@ static int llama_encode_impl( return 0; } -// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache -static void llama_kv_cache_defrag_impl(struct llama_context & lctx) { - auto & kv_self = lctx.kv_self; - - const auto & hparams = lctx.model.hparams; - - const uint32_t n_layer = hparams.n_layer; - - const uint32_t n_kv = kv_self.cell_max(); - const uint32_t n_used = kv_self.used; - - assert(n_used <= n_kv); - - //const int64_t t_start = ggml_time_us(); - - // number of cells moved - uint32_t n_moves = 0; - - // each move requires 6*n_layer tensors (see build_defrag) - // - source view, destination view, copy operation - // - x2 for keys and values - //const uint32_t max_moves = 
model.max_nodes()/(6*n_layer); - // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 - const uint32_t max_moves = (lctx.model.max_nodes() - 2*n_layer)/(6*n_layer); - - // determine which KV cells to move where - // - // cell i moves to ids[i] - // - // if ids[i] == i || ids[i] == n_kv, then cell i is not moved - // - std::vector ids(n_kv, n_kv); - - for (uint32_t i0 = 0; i0 < n_used; ++i0) { - const auto & cell0 = kv_self.cells[i0]; - - if (!cell0.is_empty()) { - ids[i0] = i0; - - continue; - } - - // found a hole - fill it with data from the end of the cache - - uint32_t nh = 1; - - // determine the size of the hole - while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) { - nh++; - } - - uint32_t nf = 0; - uint32_t is = n_kv - 1; - - // starting from the end, find nh non-empty cells - for (; is > i0; --is) { - const auto & cell1 = kv_self.cells[is]; - - if (cell1.is_empty() || ids[is] != n_kv) { - continue; - } - - // non-empty cell which is not yet moved - nf++; - - if (nf == nh) { - break; - } - } - - // this can only happen if `n_used` is not accurate, which would be a bug - GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); - - nf = 0; - - uint32_t i1 = is; - - // are we moving a continuous block of memory? - bool cont = false; - - // should we stop searching for the next move? - bool stop = false; - - // go back and move the nf cells to the hole - for (; i1 < n_kv; ++i1) { - auto & cell1 = kv_self.cells[i1]; - - if (cell1.is_empty() || ids[i1] != n_kv) { - if (n_moves == max_moves) { - stop = true; - break; - } - - cont = false; - continue; - } - - // this cell goes to (i0 + nf) - ids[i1] = i0 + nf; - - // move the cell meta data - kv_self.cells[i0 + nf] = cell1; - - // clear the old cell and move the head there - cell1 = llama_kv_cell(); - kv_self.head = n_used; - - if (!cont) { - n_moves++; - cont = true; - } - - nf++; - - if (nf == nh) { - break; - } - } - - if (stop || n_moves == max_moves) { - break; - } - - //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); - - i0 += nh - 1; - } - - if (n_moves == 0) { - return; - } - - //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); - - //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer); - -#if 0 - // CPU defrag - // - // TODO: optimizations are possible: - // - multiple threads - // - avoid copying to the host memory when already there - // - // likely not worth the effort, as we have ggml_graph based defrag - // - - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - - const uint32_t kv_size = kv_self.size; - - std::vector buf_k; - std::vector buf_v; - - for (uint32_t il = 0; il < n_layer; ++il) { - const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); - const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size); - - const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); - const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size); - - buf_k.resize(k_size); - buf_v.resize(v_size); - - ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); - - // batch move [i, i+nm) to [id, id+nm) - // note: cells can move only to a lower index - for (uint32_t i = 0; i < n_kv; ++i) { - const uint32_t id = ids[i]; - - if (i == id || id == n_kv) { - continue; - } - - uint32_t nm = 1; - - while (i + nm < n_kv && 
ids[i + nm] == id + nm) { - nm++; - } - - // move keys - { - const int64_t os = i*k_size_row; - const int64_t od = id*k_size_row; - - memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); - } - - // move values (note: they are transposed) - { - const int64_t os = i; - const int64_t od = id; - - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); - } - } - - i += nm - 1; - } - - ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); - } -#else - // ggml_graph defrag - - ggml_backend_sched_reset(lctx.sched.get()); - - ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids); - - llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); -#endif - - //const int64_t t_end = ggml_time_us(); - - //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0); -} - // TODO: move to llama_context static void llama_kv_self_update_impl(llama_context & lctx) { bool need_reserve = false; @@ -9200,13 +8410,15 @@ static void llama_kv_self_update_impl(llama_context & lctx) { // apply K-shift if needed if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { + lctx.prepare_k_shift(); + ggml_backend_sched_reset(lctx.sched.get()); ggml_cgraph * gf = llama_build_graph_k_shift(lctx); ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - lctx.set_k_shift(kv); + lctx.set_inputs({}); llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); @@ -9224,7 +8436,13 @@ static void llama_kv_self_update_impl(llama_context & lctx) { // defragment the KV cache if needed if (kv.do_defrag) { - llama_kv_cache_defrag_impl(lctx); + lctx.prepare_defrag(); + + ggml_backend_sched_reset(lctx.sched.get()); + + ggml_cgraph * gf = llama_build_graph_defrag(lctx); + + llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); need_reserve = true; @@ -9253,16 +8471,16 @@ int32_t llama_set_adapter_lora( struct llama_context * ctx, struct llama_adapter_lora * adapter, float scale) { - ctx->lora[adapter] = scale; + ctx->loras[adapter] = scale; return 0; } int32_t llama_rm_adapter_lora( struct llama_context * ctx, struct llama_adapter_lora * adapter) { - auto pos = ctx->lora.find(adapter); - if (pos != ctx->lora.end()) { - ctx->lora.erase(pos); + auto pos = ctx->loras.find(adapter); + if (pos != ctx->loras.end()) { + ctx->loras.erase(pos); return 0; } @@ -9270,7 +8488,7 @@ int32_t llama_rm_adapter_lora( } void llama_clear_adapter_lora(struct llama_context * ctx) { - ctx->lora.clear(); + ctx->loras.clear(); } int32_t llama_apply_adapter_cvec( From b4ec1d44294b628a811cc97367bb7ace0a32c9fd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 16 Jan 2025 21:55:12 +0200 Subject: [PATCH 12/28] cont : move kv_self update to llama_context ggml-ci --- src/llama-context.cpp | 119 +++++++++++++++++++++++++++ src/llama-context.h | 10 +++ src/llama.cpp | 182 +++++++----------------------------------- 3 files changed, 157 insertions(+), 154 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 910e2243d..daea125fe 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -32,6 +32,38 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } +enum ggml_status llama_context::compute_graph( + ggml_cgraph * graph, + bool batched) { + int n_threads = batched ? 
cparams.n_threads_batch : cparams.n_threads; + ggml_threadpool_t tp = batched ? threadpool_batch : threadpool; + + if (backend_cpu != nullptr) { + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu)); + auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); + set_threadpool_fn(backend_cpu, tp); + } + + // set the number of threads for all the backends + for (const auto & set_n_threads_fn : set_n_threads_fns) { + set_n_threads_fn.second(set_n_threads_fn.first, n_threads); + } + + auto status = ggml_backend_sched_graph_compute_async(sched.get(), graph); + if (status != GGML_STATUS_SUCCESS) { + LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); + } + + // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched)); + + return status; +} + + +llama_pos llama_context::pos_max() const { + return kv_self.pos_max(); +} + // TODO: improve void llama_context::reset() { inp_tokens = nullptr; @@ -540,6 +572,93 @@ ggml_tensor * llama_context::build_lora_mm_id( return res; } +bool llama_context::kv_self_update() { + bool need_reserve = false; + + auto & kv = kv_self; + + if (kv.has_shift) { + if (!kv.can_shift) { + GGML_ABORT("The current context does not support K-shift"); + } + + // apply K-shift if needed + if (model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { + prepare_k_shift(); + + ggml_backend_sched_reset(sched.get()); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + ggml_context * ctx0 = ggml_init(params); + + reset(); + + ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + build_k_shift(ctx0, gf); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + set_inputs({}); + + compute_graph(gf, false); + + ggml_free(ctx0); + + need_reserve = true; + } + + { + kv.has_shift = false; + + for (uint32_t i = 0; i < kv.size; ++i) { + kv.cells[i].delta = 0; + } + } + } + + // defragment the KV cache if needed + if (kv.do_defrag) { + prepare_defrag(); + + ggml_backend_sched_reset(sched.get()); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + ggml_context * ctx0 = ggml_init(params); + + reset(); + + ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + + build_defrag(ctx0, gf); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + // no input + //set_inputs({}); + + compute_graph(gf, false); + + ggml_free(ctx0); + + need_reserve = true; + + kv.do_defrag = false; + } + + return need_reserve; +} + void llama_context::build_attn_inp( ggml_context * ctx0, int32_t n_tokens, diff --git a/src/llama-context.h b/src/llama-context.h index a2f41b5c8..bc33fc6ef 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -79,6 +79,13 @@ struct llama_context { ggml_abort_callback abort_callback = nullptr; void * abort_callback_data = nullptr; + // returns the result of ggml_backend_sched_graph_compute_async execution + enum ggml_status compute_graph( + ggml_cgraph * graph, + bool batched); + + llama_pos pos_max() const; + void reset(); void prepare_k_shift(); @@ -129,6 +136,9 @@ struct llama_context { struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch] struct ggml_tensor * inp_K_shift; // I32 [kv_size] + // return true if need to reserve new 
worst-case graph + bool kv_self_update(); + void build_attn_inp( ggml_context * ctx0, int32_t n_tokens, diff --git a/src/llama.cpp b/src/llama.cpp index a2e5e0bea..6e2faa71c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -110,7 +110,6 @@ struct llm_build_context { const llama_hparams & hparams; const llama_cparams & cparams; const llama_ubatch & ubatch; - //const llama_kv_cache & kv_self; const llama_adapter_cvec & cvec; const llama_loras & loras; @@ -137,8 +136,6 @@ struct llm_build_context { const float norm_rms_eps; const int32_t n_tokens; - //const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size) - //const int32_t kv_head; // index of where we store new KV data in the cache const int32_t n_outputs; const int32_t n_outputs_enc; const int32_t n_ctx_orig; @@ -166,7 +163,6 @@ struct llm_build_context { hparams (model.hparams), cparams (lctx.cparams), ubatch (ubatch), - //kv_self (lctx.kv_self), cvec (lctx.cvec), loras (lctx.loras), n_embd (hparams.n_embd), @@ -190,8 +186,6 @@ struct llm_build_context { norm_eps (hparams.f_norm_eps), norm_rms_eps (hparams.f_norm_rms_eps), n_tokens (ubatch.n_tokens), - //n_kv (worst_case ? kv_self.size : kv_self.n), - //kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head), n_outputs (worst_case ? n_tokens : lctx.n_outputs), n_outputs_enc (worst_case ? n_tokens : lctx.embd_enc.size() / hparams.n_embd), n_ctx_orig (cparams.n_ctx_orig_yarn), @@ -7532,40 +7526,6 @@ struct llm_build_context { } }; -static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx) { - llama_ubatch dummy = {}; - dummy.equal_seqs = true; - - llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { }; - - struct llm_build_context llm(lctx, dummy, cb, false); - - llm.init(); - - struct ggml_cgraph * result = llm.build_defrag(); - - llm.free(); - - return result; -} - -static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) { - llama_ubatch dummy = {}; - dummy.equal_seqs = true; - - llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { }; - - struct llm_build_context llm(lctx, dummy, cb, false); - - llm.init(); - - struct ggml_cgraph * result = llm.build_k_shift(); - - llm.free(); - - return result; -} - static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_ubatch & ubatch, @@ -7836,33 +7796,6 @@ static struct ggml_cgraph * llama_build_graph( return result; } -// returns the result of ggml_backend_sched_graph_compute_async execution -static enum ggml_status llama_graph_compute( - llama_context & lctx, - ggml_cgraph * gf, - int n_threads, - ggml_threadpool * threadpool) { - if (lctx.backend_cpu != nullptr) { - auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu)); - auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); - set_threadpool_fn(lctx.backend_cpu, threadpool); - } - - // set the number of threads for all the backends - for (const auto & set_n_threads_fn : lctx.set_n_threads_fns) { - set_n_threads_fn.second(set_n_threads_fn.first, n_threads); - } - - auto status = ggml_backend_sched_graph_compute_async(lctx.sched.get(), gf); - if (status != GGML_STATUS_SUCCESS) { - LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); - } - - // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); - - return status; -} - // decode a batch of tokens by evaluating 
the transformer // in case of unsuccessful decoding (error or warning), // the kv_cache state will be returned to its original state @@ -7887,7 +7820,7 @@ static int llama_decode_impl( } // temporary allocate memory for the input batch if needed - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.pos_max() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; const uint32_t n_tokens_all = batch.n_tokens; @@ -7989,16 +7922,11 @@ static int llama_decode_impl( lctx.n_outputs = n_outputs_new; } - int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; - ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch; - - GGML_ASSERT(n_threads > 0); - lctx.prepare_decode(ubatch); // non-causal masks do not use the KV cache if (hparams.causal_attn) { - llama_kv_self_update(&lctx); // TODO: lctx->kv_self_update() + llama_kv_self_update(&lctx); // if we have enough unused cells before the current head -> // better to start searching from the beginning of the cache, hoping to fill it @@ -8058,7 +7986,7 @@ static int llama_decode_impl( GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); } - const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool); + const auto compute_status = lctx.compute_graph(gf, n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { kv_slot_restorer.restore(kv_self); switch (compute_status) { @@ -8226,7 +8154,7 @@ static int llama_encode_impl( } // temporary allocate memory for the input batch if needed - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.pos_max() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; const uint32_t n_tokens = batch.n_tokens; @@ -8274,11 +8202,6 @@ static int llama_encode_impl( lctx.inp_embd_enc = NULL; lctx.n_outputs = n_tokens; - int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; - ggml_threadpool_t threadpool = n_tokens == 1 ? 
lctx.threadpool : lctx.threadpool_batch; - - GGML_ASSERT(n_threads > 0); - lctx.prepare_decode(ubatch); ggml_backend_sched_reset(lctx.sched.get()); @@ -8310,7 +8233,7 @@ static int llama_encode_impl( } } - const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool); + const auto compute_status = lctx.compute_graph(gf, n_tokens > 1); switch (compute_status) { case GGML_STATUS_SUCCESS: break; @@ -8397,76 +8320,6 @@ static int llama_encode_impl( return 0; } -// TODO: move to llama_context -static void llama_kv_self_update_impl(llama_context & lctx) { - bool need_reserve = false; - - auto & kv = lctx.kv_self; - - if (kv.has_shift) { - if (!kv.can_shift) { - GGML_ABORT("The current context does not support K-shift"); - } - - // apply K-shift if needed - if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { - lctx.prepare_k_shift(); - - ggml_backend_sched_reset(lctx.sched.get()); - - ggml_cgraph * gf = llama_build_graph_k_shift(lctx); - - ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - - lctx.set_inputs({}); - - llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); - - need_reserve = true; - } - - { - kv.has_shift = false; - - for (uint32_t i = 0; i < kv.size; ++i) { - kv.cells[i].delta = 0; - } - } - } - - // defragment the KV cache if needed - if (kv.do_defrag) { - lctx.prepare_defrag(); - - ggml_backend_sched_reset(lctx.sched.get()); - - ggml_cgraph * gf = llama_build_graph_defrag(lctx); - - llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); - - need_reserve = true; - - kv.do_defrag = false; - } - - // reserve a worst case graph again - if (need_reserve) { - // TODO: extract to a function - // build worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(lctx.cparams.n_ctx, lctx.cparams.n_ubatch); - llama_token token = lctx.model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true); - - // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(lctx.sched.get()); - if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - } - } -} - int32_t llama_set_adapter_lora( struct llama_context * ctx, struct llama_adapter_lora * adapter, @@ -9224,9 +9077,30 @@ void llama_kv_cache_update(llama_context * ctx) { llama_kv_self_update(ctx); } -// TODO: move to llama-context void llama_kv_self_update(llama_context * ctx) { - llama_kv_self_update_impl(*ctx); + const bool need_reserve = ctx->kv_self_update(); + + // reserve a worst case graph again + if (need_reserve) { + // TODO: extract to a function + const auto & cparams = ctx->cparams; + const auto & model = ctx->model; + + // build worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + ggml_cgraph * gf = llama_build_graph(*ctx, ubatch, true); + + // initialize scheduler with the worst-case graph + 
ggml_backend_sched_reset(ctx->sched.get()); + if (!ggml_backend_sched_reserve(ctx->sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + } + } } /// From f0713498fd05afe117647c76f536866640b77b90 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Jan 2025 11:51:35 +0200 Subject: [PATCH 13/28] context : add get_ctx_padding() ggml-ci --- src/llama-context.cpp | 4 ++++ src/llama-context.h | 3 +++ src/llama.cpp | 4 +++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index daea125fe..6a73659d0 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -64,6 +64,10 @@ llama_pos llama_context::pos_max() const { return kv_self.pos_max(); } +uint32_t llama_context::get_ctx_padding(const llama_cparams & cparams) const { + return kv_self.get_padding(cparams); +} + // TODO: improve void llama_context::reset() { inp_tokens = nullptr; diff --git a/src/llama-context.h b/src/llama-context.h index bc33fc6ef..45eaafaad 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -84,8 +84,11 @@ struct llama_context { ggml_cgraph * graph, bool batched); + // max token position across all sequences in the current context llama_pos pos_max() const; + uint32_t get_ctx_padding(const llama_cparams & cparams) const; + void reset(); void prepare_k_shift(); diff --git a/src/llama.cpp b/src/llama.cpp index 6e2faa71c..569c67c02 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7820,6 +7820,7 @@ static int llama_decode_impl( } // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; @@ -8154,6 +8155,7 @@ static int llama_encode_impl( } // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; @@ -8629,7 +8631,7 @@ struct llama_context * llama_init_from_model( cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; // this is necessary due to kv_self.n being padded later during inference - cparams.n_ctx = GGML_PAD(cparams.n_ctx, ctx->kv_self.get_padding(cparams)); + cparams.n_ctx = GGML_PAD(cparams.n_ctx, ctx->get_ctx_padding(cparams)); // with causal attention, the batch size is limited by the context size cparams.n_batch = hparams.causal_attn ? 
std::min(cparams.n_ctx, params.n_batch) : params.n_batch; From c75ba6851e1f6079ff7c823672908a2e5767418a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Jan 2025 12:41:16 +0200 Subject: [PATCH 14/28] context : move adapter code in the implementation [no ci] --- src/llama-context.cpp | 37 +++++++++++++++++++++++++++++++++++++ src/llama.cpp | 40 +++++----------------------------------- 2 files changed, 42 insertions(+), 35 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 6a73659d0..5cb31abc0 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1788,6 +1788,43 @@ float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id return it->second.data(); } +// llama adapter API + +int32_t llama_set_adapter_lora( + struct llama_context * ctx, + struct llama_adapter_lora * adapter, + float scale) { + ctx->loras[adapter] = scale; + return 0; +} + +int32_t llama_rm_adapter_lora( + struct llama_context * ctx, + struct llama_adapter_lora * adapter) { + auto pos = ctx->loras.find(adapter); + if (pos != ctx->loras.end()) { + ctx->loras.erase(pos); + return 0; + } + + return -1; +} + +void llama_clear_adapter_lora(struct llama_context * ctx) { + ctx->loras.clear(); +} + +int32_t llama_apply_adapter_cvec( + struct llama_context * ctx, + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end) { + return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end); +} + + // llama state API // deprecated diff --git a/src/llama.cpp b/src/llama.cpp index 569c67c02..b80b1c4d1 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8322,40 +8322,6 @@ static int llama_encode_impl( return 0; } -int32_t llama_set_adapter_lora( - struct llama_context * ctx, - struct llama_adapter_lora * adapter, - float scale) { - ctx->loras[adapter] = scale; - return 0; -} - -int32_t llama_rm_adapter_lora( - struct llama_context * ctx, - struct llama_adapter_lora * adapter) { - auto pos = ctx->loras.find(adapter); - if (pos != ctx->loras.end()) { - ctx->loras.erase(pos); - return 0; - } - - return -1; -} - -void llama_clear_adapter_lora(struct llama_context * ctx) { - ctx->loras.clear(); -} - -int32_t llama_apply_adapter_cvec( - struct llama_context * ctx, - const float * data, - size_t len, - int32_t n_embd, - int32_t il_start, - int32_t il_end) { - return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end); -} - // // interface implementation // @@ -8924,7 +8890,7 @@ struct llama_context * llama_new_context_with_model( } // -// kv cache +// kv cache view // struct llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) { @@ -8935,6 +8901,10 @@ void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * llama_kv_cache_view_update(view, ctx->kv_self); } +// +// kv cache +// + // deprecated int32_t llama_get_kv_cache_token_count(const llama_context * ctx) { return llama_kv_self_n_tokens(ctx); From 133ad6a7232914459afc902107a53342d3abfb3b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Jan 2025 14:42:09 +0200 Subject: [PATCH 15/28] context : initial need_reserve logic ggml-ci --- src/llama-context.cpp | 173 +++++++++++++++++++++- src/llama-context.h | 4 +- src/llama.cpp | 337 +++++++++++++----------------------------- 3 files changed, 269 insertions(+), 245 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 5cb31abc0..d696090cc 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -576,9 +576,7 @@ ggml_tensor * 
llama_context::build_lora_mm_id( return res; } -bool llama_context::kv_self_update() { - bool need_reserve = false; - +void llama_context::kv_self_update() { auto & kv = kv_self; if (kv.has_shift) { @@ -655,12 +653,14 @@ bool llama_context::kv_self_update() { ggml_free(ctx0); - need_reserve = true; - kv.do_defrag = false; - } - return need_reserve; + need_reserve = true; + } +} + +void llama_kv_self_update(llama_context * ctx) { + ctx->kv_self_update(); } void llama_context::build_attn_inp( @@ -1824,6 +1824,165 @@ int32_t llama_apply_adapter_cvec( return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end); } +// +// kv cache view +// + +struct llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) { + return llama_kv_cache_view_init(ctx->kv_self, n_seq_max); +} + +void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * view) { + llama_kv_cache_view_update(view, ctx->kv_self); +} + +// +// kv cache +// + +// deprecated +int32_t llama_get_kv_cache_token_count(const llama_context * ctx) { + return llama_kv_self_n_tokens(ctx); +} + +int32_t llama_kv_self_n_tokens(const llama_context * ctx) { + return llama_kv_cache_n_tokens(&ctx->kv_self); +} + +// deprecated +int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) { + return llama_kv_self_used_cells(ctx); +} + +int32_t llama_kv_self_used_cells(const llama_context * ctx) { + return llama_kv_cache_used_cells(&ctx->kv_self); +} + +// deprecated +void llama_kv_cache_clear(llama_context * ctx) { + llama_kv_self_clear(ctx); +} + +void llama_kv_self_clear(llama_context * ctx) { + llama_kv_cache_clear(&ctx->kv_self); +} + +// deprecated +bool llama_kv_cache_seq_rm( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + return llama_kv_self_seq_rm(ctx, seq_id, p0, p1); +} + +bool llama_kv_self_seq_rm( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { + return llama_kv_cache_seq_rm(&ctx->kv_self, seq_id, p0, p1); +} + +// deprecated +void llama_kv_cache_seq_cp( + llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + return llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1); +} + +void llama_kv_self_seq_cp( + llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1) { + return llama_kv_cache_seq_cp(&ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); +} + +// deprecated +void llama_kv_cache_seq_keep( + llama_context * ctx, + llama_seq_id seq_id) { + return llama_kv_self_seq_keep(ctx, seq_id); +} + +void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_cache_seq_keep(&ctx->kv_self, seq_id); +} + +// deprecated +void llama_kv_cache_seq_add( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + return llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta); +} + +void llama_kv_self_seq_add( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + return llama_kv_cache_seq_add(&ctx->kv_self, seq_id, p0, p1, delta); +} + +// deprecated +void llama_kv_cache_seq_div( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + return llama_kv_self_seq_div(ctx, seq_id, p0, p1, d); +} + +void llama_kv_self_seq_div( + llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + return llama_kv_cache_seq_div(&ctx->kv_self, seq_id, p0, p1, d); +} 
+ +// deprecated +llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_self_seq_pos_max(ctx, seq_id); +} + +llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { + return llama_kv_cache_seq_pos_max(&ctx->kv_self, seq_id); +} + +// deprecated +void llama_kv_cache_defrag(llama_context * ctx) { + return llama_kv_self_defrag(ctx); +} + +void llama_kv_self_defrag(llama_context * ctx) { + return llama_kv_cache_defrag(&ctx->kv_self); +} + +// deprecated +bool llama_kv_cache_can_shift(const llama_context * ctx) { + return llama_kv_self_can_shift(ctx); +} + +bool llama_kv_self_can_shift(const llama_context * ctx) { + return llama_kv_cache_can_shift(&ctx->kv_self); +} + +// deprecated +void llama_kv_cache_update(llama_context * ctx) { + llama_kv_self_update(ctx); +} // llama state API diff --git a/src/llama-context.h b/src/llama-context.h index 45eaafaad..eb9a17391 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -62,6 +62,7 @@ struct llama_context { int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch bool logits_all = false; + bool need_reserve = false; // embeddings output (2-dimensional array: [n_outputs][n_embd]) // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE @@ -87,6 +88,7 @@ struct llama_context { // max token position across all sequences in the current context llama_pos pos_max() const; + // certain implementations could require a padding for the context size uint32_t get_ctx_padding(const llama_cparams & cparams) const; void reset(); @@ -140,7 +142,7 @@ struct llama_context { struct ggml_tensor * inp_K_shift; // I32 [kv_size] // return true if need to reserve new worst-case graph - bool kv_self_update(); + void kv_self_update(); void build_attn_inp( ggml_context * ctx0, diff --git a/src/llama.cpp b/src/llama.cpp index b80b1c4d1..5807fa388 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -28,57 +28,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback -static int llama_model_load(const std::string & fname, std::vector & splits, llama_model & model, llama_model_params & params) { - // loading time will be recalculated after the first eval, so - // we take page faults deferred by mmap() into consideration - model.t_load_us = 0; - time_meas tm(model.t_load_us); - - model.t_start_us = tm.t_start_us; - - try { - llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides); - - ml.print_info(); - - model.hparams.vocab_only = params.vocab_only; - - try { - model.load_arch(ml); - } catch(const std::exception & e) { - throw std::runtime_error("error loading model architecture: " + std::string(e.what())); - } - try { - model.load_hparams(ml); - } catch(const std::exception & e) { - throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what())); - } - try { - model.load_vocab(ml); - } catch(const std::exception & e) { - throw std::runtime_error("error loading model vocabulary: " + std::string(e.what())); - } - - model.load_stats(ml); - model.print_info(); - - if (params.vocab_only) { - LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); - return 0; - } - - if (!model.load_tensors(ml)) { - return -2; - } - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what()); - return -1; - } - - return 0; -} - // // llm_build // 
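For readers tracking the rename above: the deprecated llama_kv_cache_* entry points now simply forward to the new llama_kv_self_* functions, so migrating caller code is a one-to-one rename with identical arguments. A minimal sketch of the usual context-shift idiom under that assumption (it presumes the llama_kv_self_* declarations are exposed in llama.h at this point in the series; ctx, n_keep and n_discard are hypothetical caller-side values, not part of the patch):

    #include "llama.h"

    // old names: deprecated wrappers that forward to the llama_kv_self_* functions
    static void shift_context_old(llama_context * ctx, llama_pos n_keep, llama_pos n_discard) {
        llama_kv_cache_seq_rm (ctx, 0, n_keep, n_keep + n_discard);         // drop positions [n_keep, n_keep + n_discard)
        llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, -1, -n_discard); // shift the remaining positions back
    }

    // new names: same call shape, operating on the context's own KV cache
    static void shift_context_new(llama_context * ctx, llama_pos n_keep, llama_pos n_discard) {
        llama_kv_self_seq_rm (ctx, 0, n_keep, n_keep + n_discard);
        llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, -1, -n_discard);
    }
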
@@ -7951,6 +7900,30 @@ static int llama_decode_impl( } } + // reserve a worst case graph if needed + // TODO: extract to a function + if (lctx.need_reserve) { + const auto & cparams = lctx.cparams; + const auto & model = lctx.model; + + // build worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true); + + // initialize scheduler with the worst-case graph + ggml_backend_sched_reset(lctx.sched.get()); + if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + } + + lctx.need_reserve = false; + } + //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); ggml_backend_sched_reset(lctx.sched.get()); @@ -8206,6 +8179,31 @@ static int llama_encode_impl( lctx.prepare_decode(ubatch); + // reserve a worst case graph if needed + // TODO: extract to a function + if (lctx.need_reserve) { + // TODO: extract to a function + const auto & cparams = lctx.cparams; + const auto & model = lctx.model; + + // build worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true); + + // initialize scheduler with the worst-case graph + ggml_backend_sched_reset(lctx.sched.get()); + if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + } + + lctx.need_reserve = false; + } + ggml_backend_sched_reset(lctx.sched.get()); ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); @@ -8419,6 +8417,57 @@ int64_t llama_time_us(void) { return ggml_time_us(); } +// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback +static int llama_model_load(const std::string & fname, std::vector & splits, llama_model & model, llama_model_params & params) { + // loading time will be recalculated after the first eval, so + // we take page faults deferred by mmap() into consideration + model.t_load_us = 0; + time_meas tm(model.t_load_us); + + model.t_start_us = tm.t_start_us; + + try { + llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides); + + ml.print_info(); + + model.hparams.vocab_only = params.vocab_only; + + try { + model.load_arch(ml); + } catch(const std::exception & e) { + throw std::runtime_error("error loading model architecture: " + std::string(e.what())); + } + try { + model.load_hparams(ml); + } catch(const std::exception & e) { + throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what())); + } + try { + model.load_vocab(ml); + } catch(const std::exception & e) { + throw std::runtime_error("error loading model vocabulary: " + 
std::string(e.what())); + } + + model.load_stats(ml); + model.print_info(); + + if (params.vocab_only) { + LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); + return 0; + } + + if (!model.load_tensors(ml)) { + return -2; + } + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what()); + return -1; + } + + return 0; +} + static struct llama_model * llama_model_load_from_file_impl( const std::string & path_model, std::vector & splits, @@ -8889,192 +8938,6 @@ struct llama_context * llama_new_context_with_model( return llama_init_from_model(model, params); } -// -// kv cache view -// - -struct llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) { - return llama_kv_cache_view_init(ctx->kv_self, n_seq_max); -} - -void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * view) { - llama_kv_cache_view_update(view, ctx->kv_self); -} - -// -// kv cache -// - -// deprecated -int32_t llama_get_kv_cache_token_count(const llama_context * ctx) { - return llama_kv_self_n_tokens(ctx); -} - -int32_t llama_kv_self_n_tokens(const llama_context * ctx) { - return llama_kv_cache_n_tokens(&ctx->kv_self); -} - -// deprecated -int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) { - return llama_kv_self_used_cells(ctx); -} - -int32_t llama_kv_self_used_cells(const llama_context * ctx) { - return llama_kv_cache_used_cells(&ctx->kv_self); -} - -// deprecated -void llama_kv_cache_clear(llama_context * ctx) { - llama_kv_self_clear(ctx); -} - -void llama_kv_self_clear(llama_context * ctx) { - llama_kv_cache_clear(&ctx->kv_self); -} - -// deprecated -bool llama_kv_cache_seq_rm( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1) { - return llama_kv_self_seq_rm(ctx, seq_id, p0, p1); -} - -bool llama_kv_self_seq_rm( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1) { - return llama_kv_cache_seq_rm(&ctx->kv_self, seq_id, p0, p1); -} - -// deprecated -void llama_kv_cache_seq_cp( - llama_context * ctx, - llama_seq_id seq_id_src, - llama_seq_id seq_id_dst, - llama_pos p0, - llama_pos p1) { - return llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1); -} - -void llama_kv_self_seq_cp( - llama_context * ctx, - llama_seq_id seq_id_src, - llama_seq_id seq_id_dst, - llama_pos p0, - llama_pos p1) { - return llama_kv_cache_seq_cp(&ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); -} - -// deprecated -void llama_kv_cache_seq_keep( - llama_context * ctx, - llama_seq_id seq_id) { - return llama_kv_self_seq_keep(ctx, seq_id); -} - -void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) { - return llama_kv_cache_seq_keep(&ctx->kv_self, seq_id); -} - -// deprecated -void llama_kv_cache_seq_add( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - llama_pos delta) { - return llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta); -} - -void llama_kv_self_seq_add( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - llama_pos delta) { - return llama_kv_cache_seq_add(&ctx->kv_self, seq_id, p0, p1, delta); -} - -// deprecated -void llama_kv_cache_seq_div( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - int d) { - return llama_kv_self_seq_div(ctx, seq_id, p0, p1, d); -} - -void llama_kv_self_seq_div( - llama_context * ctx, - llama_seq_id seq_id, - llama_pos p0, - llama_pos p1, - int d) { - return llama_kv_cache_seq_div(&ctx->kv_self, seq_id, p0, p1, d); 
-} - -// deprecated -llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { - return llama_kv_self_seq_pos_max(ctx, seq_id); -} - -llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) { - return llama_kv_cache_seq_pos_max(&ctx->kv_self, seq_id); -} - -// deprecated -void llama_kv_cache_defrag(llama_context * ctx) { - return llama_kv_self_defrag(ctx); -} - -void llama_kv_self_defrag(llama_context * ctx) { - return llama_kv_cache_defrag(&ctx->kv_self); -} - -// deprecated -bool llama_kv_cache_can_shift(const llama_context * ctx) { - return llama_kv_self_can_shift(ctx); -} - -bool llama_kv_self_can_shift(const llama_context * ctx) { - return llama_kv_cache_can_shift(&ctx->kv_self); -} - -// deprecated -void llama_kv_cache_update(llama_context * ctx) { - llama_kv_self_update(ctx); -} - -void llama_kv_self_update(llama_context * ctx) { - const bool need_reserve = ctx->kv_self_update(); - - // reserve a worst case graph again - if (need_reserve) { - // TODO: extract to a function - const auto & cparams = ctx->cparams; - const auto & model = ctx->model; - - // build worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - - llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - - ggml_cgraph * gf = llama_build_graph(*ctx, ubatch, true); - - // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(ctx->sched.get()); - if (!ggml_backend_sched_reserve(ctx->sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - } - } -} - /// int32_t llama_encode( From cb8f2095c6f74d9fbb9bdfbb2ae1bf6178472150 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Jan 2025 19:37:52 +0200 Subject: [PATCH 16/28] wip --- src/llama.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 5807fa388..6c8df8a11 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7773,6 +7773,7 @@ static int llama_decode_impl( llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : lctx.pos_max() + 1); const llama_batch & batch = batch_allocr.batch; + const uint32_t n_tokens_all = batch.n_tokens; const auto & model = lctx.model; @@ -7800,9 +7801,6 @@ static int llama_decode_impl( } lctx.n_queued_tokens += n_tokens_all; - auto & kv_self = lctx.kv_self; - llama_kv_slot_restorer kv_slot_restorer(kv_self); - const int64_t n_embd = hparams.n_embd; const int64_t n_vocab = vocab.n_tokens(); @@ -7828,16 +7826,19 @@ static int llama_decode_impl( n_outputs = 1; } - lctx.sbatch.from_batch(batch, n_embd, - /* simple_split */ !kv_self.recurrent, - /* logits_all */ n_outputs == n_tokens_all); - // reserve output buffer if (llama_output_reserve(lctx, n_outputs) < n_outputs) { LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs); return -2; }; + auto & kv_self = lctx.kv_self; + llama_kv_slot_restorer kv_slot_restorer(kv_self); + + lctx.sbatch.from_batch(batch, n_embd, + /* simple_split */ !kv_self.recurrent, + /* logits_all */ n_outputs == n_tokens_all); + while (lctx.sbatch.n_tokens > 0) { llama_ubatch ubatch; if (kv_self.recurrent) { @@ -8645,7 +8646,6 @@ struct llama_context * llama_init_from_model( cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; - // this is necessary due to kv_self.n being padded later during inference cparams.n_ctx = GGML_PAD(cparams.n_ctx, ctx->get_ctx_padding(cparams)); // with causal attention, the batch size is limited by the context size From 99422dfa3f0c686d89492958946a9b2ca91012da Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Jan 2025 20:30:16 +0200 Subject: [PATCH 17/28] context : introduce llama_batch_manager ggml-ci --- src/llama-context.cpp | 130 ++++++++++++++++++++++++++++++++++++++++-- src/llama-context.h | 18 +++++- src/llama.cpp | 87 ++++++---------------------- 3 files changed, 162 insertions(+), 73 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index d696090cc..de54321df 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -32,6 +32,132 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } +struct llama_batch_manager : public llama_batch_manager_i { + llama_batch_manager(llama_context & lctx, const llama_batch & batch, bool logits_all) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { + const auto & hparams = lctx.model.hparams; + const auto & n_embd = hparams.n_embd; + + const auto & kv_self = lctx.kv_self; + + lctx.sbatch.from_batch(batch, n_embd, + /* simple_split */ !kv_self.recurrent, + /* logits_all */ logits_all); + } + + ~llama_batch_manager() override { + } + + virtual llama_ubatch next() override { + ubatch = llama_ubatch(); + + const auto & cparams = lctx.cparams; + const auto & kv_self = lctx.kv_self; + + const auto & n_ubatch = cparams.n_ubatch; + + const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; + + if (kv_self.recurrent) { + if (embd_pooled) { + // Pooled embeddings cannot be split across ubatches (yet) + ubatch = lctx.sbatch.split_seq(n_ubatch); + } else { + // recurrent model architectures are easier to implement + // with equal-length sequences + ubatch = lctx.sbatch.split_equal(n_ubatch); + } + } else { + ubatch = lctx.sbatch.split_simple(n_ubatch); + } + + return ubatch; + } + + virtual bool prepare() override { + const auto & cparams = 
lctx.cparams; + const auto & hparams = lctx.model.hparams; + + auto & kv_self = lctx.kv_self; + + // non-causal masks do not use the KV cache + if (hparams.causal_attn) { + llama_kv_self_update(&lctx); + + // if we have enough unused cells before the current head -> + // better to start searching from the beginning of the cache, hoping to fill it + if (kv_self.head > kv_self.used + 2*ubatch.n_tokens) { + kv_self.head = 0; + } + + const auto slot_info = kv_self.find_slot(ubatch); + if (!slot_info) { + return false; + } + + kv_slot_restorer.save(slot_info); + + if (!kv_self.recurrent) { + // a heuristic, to avoid attending the full cache if it is not yet utilized + // after enough generations, the benefit from this heuristic disappears + // if we start defragmenting the cache, the benefit from this will be more important + const uint32_t pad = kv_self.get_padding(cparams); + kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(kv_self.cell_max(), pad))); + //kv_self.n = llama_kv_cache_cell_max(kv_self); + } + } + + return true; + } + + virtual void restore() override { + kv_slot_restorer.restore(lctx.kv_self); + } + + virtual void update() override { + auto & kv_self = lctx.kv_self; + + // update the kv ring buffer + { + kv_self.head += ubatch.n_tokens; + + // Ensure kv cache head points to a valid index. + if (kv_self.head >= kv_self.size) { + kv_self.head = 0; + } + } + } + + virtual void finalize() override { + const auto & cparams = lctx.cparams; + + auto & kv_self = lctx.kv_self; + + // decide if we need to defrag the kv cache + if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) { + const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f; + + // queue defragmentation for next llama_kv_cache_update + if (fragmentation > cparams.defrag_thold) { + //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation); + + kv_self.defrag(); + } + } + } + + llama_context & lctx; + + const llama_batch & batch; + + llama_ubatch ubatch; + + llama_kv_slot_restorer kv_slot_restorer; +}; + +std::unique_ptr llama_context::prepare_batch(const llama_batch & batch, bool logits_all) { + return std::make_unique(*this, batch, logits_all); +} + enum ggml_status llama_context::compute_graph( ggml_cgraph * graph, bool batched) { @@ -59,7 +185,6 @@ enum ggml_status llama_context::compute_graph( return status; } - llama_pos llama_context::pos_max() const { return kv_self.pos_max(); } @@ -94,9 +219,6 @@ void llama_context::prepare_k_shift() { void llama_context::prepare_defrag() { } -void llama_context::prepare_decode(const llama_ubatch & /*ubatch*/) { -} - // llama input void llama_context::set_inputs(const llama_ubatch & ubatch) { diff --git a/src/llama-context.h b/src/llama-context.h index eb9a17391..47233f4f5 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -16,6 +16,20 @@ using llama_loras = std::unordered_map; +// TODO: this is very WIP - improve +struct llama_batch_manager_i { + virtual ~llama_batch_manager_i() = default; + + //bool is_done() const; + + virtual llama_ubatch next() = 0; + + virtual bool prepare() = 0; + virtual void restore() = 0; + virtual void update() = 0; + virtual void finalize() = 0; +}; + struct llama_context { llama_context(const llama_model & model) : model(model) @@ -80,6 +94,9 @@ struct llama_context { ggml_abort_callback abort_callback = nullptr; void * abort_callback_data = nullptr; + // TODO: do not pass logits_all explicitly + std::unique_ptr prepare_batch(const llama_batch & batch, bool logits_all); + // returns the 
result of ggml_backend_sched_graph_compute_async execution enum ggml_status compute_graph( ggml_cgraph * graph, @@ -95,7 +112,6 @@ struct llama_context { void prepare_k_shift(); void prepare_defrag(); - void prepare_decode(const llama_ubatch & ubatch); void set_inputs(const llama_ubatch & ubatch); diff --git a/src/llama.cpp b/src/llama.cpp index 6c8df8a11..8f6de199a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7807,8 +7807,6 @@ static int llama_decode_impl( uint32_t n_outputs = 0; uint32_t n_outputs_prev = 0; - const auto n_ubatch = cparams.n_ubatch; - // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; @@ -7832,27 +7830,19 @@ static int llama_decode_impl( return -2; }; - auto & kv_self = lctx.kv_self; - llama_kv_slot_restorer kv_slot_restorer(kv_self); + const bool logits_all = n_outputs == n_tokens_all; - lctx.sbatch.from_batch(batch, n_embd, - /* simple_split */ !kv_self.recurrent, - /* logits_all */ n_outputs == n_tokens_all); + //auto & kv_self = lctx.kv_self; + //llama_kv_slot_restorer kv_slot_restorer(kv_self); + + //lctx.sbatch.from_batch(batch, n_embd, + // /* simple_split */ !kv_self.recurrent, + // /* logits_all */ logits_all); + + auto batch_manager = lctx.prepare_batch(batch, logits_all); while (lctx.sbatch.n_tokens > 0) { - llama_ubatch ubatch; - if (kv_self.recurrent) { - if (embd_pooled) { - // Pooled embeddings cannot be split across ubatches (yet) - ubatch = lctx.sbatch.split_seq(n_ubatch); - } else { - // recurrent model architectures are easier to implement - // with equal-length sequences - ubatch = lctx.sbatch.split_equal(n_ubatch); - } - } else { - ubatch = lctx.sbatch.split_simple(n_ubatch); - } + llama_ubatch ubatch = batch_manager->next(); const uint32_t n_tokens = ubatch.n_tokens; @@ -7873,32 +7863,10 @@ static int llama_decode_impl( lctx.n_outputs = n_outputs_new; } - lctx.prepare_decode(ubatch); - - // non-causal masks do not use the KV cache - if (hparams.causal_attn) { - llama_kv_self_update(&lctx); - - // if we have enough unused cells before the current head -> - // better to start searching from the beginning of the cache, hoping to fill it - if (kv_self.head > kv_self.used + 2*n_tokens) { - kv_self.head = 0; - } - - const auto slot_info = kv_self.find_slot(ubatch); - if (!slot_info) { - return 1; - } - kv_slot_restorer.save(slot_info); - - if (!kv_self.recurrent) { - // a heuristic, to avoid attending the full cache if it is not yet utilized - // after enough generations, the benefit from this heuristic disappears - // if we start defragmenting the cache, the benefit from this will be more important - const uint32_t pad = kv_self.get_padding(cparams); - kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(kv_self.cell_max(), pad))); - //kv_self.n = llama_kv_cache_cell_max(kv_self); - } + if (!batch_manager->prepare()) { + LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); + batch_manager->restore(); + return -3; } // reserve a worst case graph if needed @@ -7963,7 +7931,7 @@ static int llama_decode_impl( const auto compute_status = lctx.compute_graph(gf, n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { - kv_slot_restorer.restore(kv_self); + batch_manager->restore(); switch (compute_status) { case GGML_STATUS_ABORTED: return 2; @@ -7975,15 +7943,7 @@ static int llama_decode_impl( } } - // update the kv ring buffer - { - kv_self.head += n_tokens; - - // Ensure kv cache head points to a 
valid index. - if (kv_self.head >= kv_self.size) { - kv_self.head = 0; - } - } + batch_manager->update(); // plot the computation graph in dot format (for debugging purposes) //if (n_past%100 == 0) { @@ -8061,6 +8021,7 @@ static int llama_decode_impl( } } } + n_outputs_prev += lctx.n_outputs; } @@ -8089,17 +8050,7 @@ static int llama_decode_impl( // wait for the computation to finish (automatically done when obtaining the model output) //llama_synchronize(&lctx); - // decide if we need to defrag the kv cache - if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) { - const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f; - - // queue defragmentation for next llama_kv_cache_update - if (fragmentation > cparams.defrag_thold) { - //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation); - - kv_self.defrag(); - } - } + batch_manager->finalize(); // Reset state for the next token before backend sync, to allow the CPU activities in the reset to // overlap with device computation. @@ -8178,7 +8129,7 @@ static int llama_encode_impl( lctx.inp_embd_enc = NULL; lctx.n_outputs = n_tokens; - lctx.prepare_decode(ubatch); + //batch_manager->prepare(ubatch); // reserve a worst case graph if needed // TODO: extract to a function From a0c500b4dc91b87acba2529d2db7a2d28f1c3bb6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 17 Jan 2025 21:11:03 +0200 Subject: [PATCH 18/28] context : prepare for abstraction ggml-ci --- src/llama-context.cpp | 307 ++++++++++++++++++++++++++++++++++++++++- src/llama-context.h | 11 +- src/llama.cpp | 314 ++---------------------------------------- 3 files changed, 323 insertions(+), 309 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index de54321df..4e6033ff1 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -32,6 +32,309 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } +llama_context::llama_context(const llama_model & model, const llama_context_params & params, std::function fn_build_graph_worst) : + model(model), + t_start_us(model.t_start_us), + t_load_us (model.t_load_us) { + + const auto & hparams = model.hparams; + + cparams.n_seq_max = std::max(1u, params.n_seq_max); + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch; + cparams.yarn_ext_factor = params.yarn_ext_factor; + cparams.yarn_attn_factor = params.yarn_attn_factor; + cparams.yarn_beta_fast = params.yarn_beta_fast; + cparams.yarn_beta_slow = params.yarn_beta_slow; + cparams.defrag_thold = params.defrag_thold; + cparams.embeddings = params.embeddings; + cparams.offload_kqv = params.offload_kqv; + cparams.flash_attn = params.flash_attn; + cparams.no_perf = params.no_perf; + cparams.pooling_type = params.pooling_type; + + cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; + cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; + cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; + + cparams.n_ctx = GGML_PAD(cparams.n_ctx, get_ctx_padding(cparams)); + + // with causal attention, the batch size is limited by the context size + cparams.n_batch = hparams.causal_attn ? 
std::min(cparams.n_ctx, params.n_batch) : params.n_batch; + + // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask + // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext) + // ref: https://github.com/ggerganov/llama.cpp/pull/5021 + if (cparams.n_batch < GGML_KQ_MASK_PAD) { + LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); + cparams.n_batch = GGML_KQ_MASK_PAD; + } + + cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); + + cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : + hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn : + hparams.n_ctx_train; + + cparams.cb_eval = params.cb_eval; + cparams.cb_eval_user_data = params.cb_eval_user_data; + + auto rope_scaling_type = params.rope_scaling_type; + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { + rope_scaling_type = hparams.rope_scaling_type_train; + } + + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) { + cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none + } + + if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' + cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f; + } + + cparams.yarn_attn_factor *= hparams.rope_attn_factor; + + if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { + if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { + cparams.pooling_type = LLAMA_POOLING_TYPE_NONE; + } else { + cparams.pooling_type = hparams.pooling_type; + } + } + + if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) { + cparams.causal_attn = hparams.causal_attn; + } else { + cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL; + } + + const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; + + LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); + LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); + LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq); + LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); + LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); + LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); + LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); + LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); + + if (n_ctx_per_seq < hparams.n_ctx_train) { + LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", + __func__, n_ctx_per_seq, hparams.n_ctx_train); + } + + if (n_ctx_per_seq > hparams.n_ctx_train) { + LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", + __func__, n_ctx_per_seq, hparams.n_ctx_train); + } + + logits_all = params.logits_all; + + // build worst-case graph for encoder if a model contains encoder + is_encoding = llama_model_has_encoder(&model); // TODO: model.has_encoder() + + uint32_t kv_size = cparams.n_ctx; + ggml_type type_k = params.type_k; + ggml_type type_v = params.type_v; + + // Mamba only needs a constant number of KV cache cells per sequence + if (llama_model_is_recurrent(&model)) { + // Mamba needs at least as many KV cells as there are sequences kept at any time + kv_size = std::max((uint32_t) 1, params.n_seq_max); + // it's probably best to keep as much precision as possible for the 
states + type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states + type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states + } + + GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); + GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); + + if (!hparams.vocab_only) { + // GPU backends + for (auto * dev : model.devices) { + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); + throw std::runtime_error("failed to initialize backend"); + } + backends.emplace_back(backend); + } + + // add ACCEL backends (such as BLAS) + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) { + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); + throw std::runtime_error("failed to initialize backend"); + } + backends.emplace_back(backend); + } + } + + // add CPU backend + backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + if (backend_cpu == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); + throw std::runtime_error("failed to initialize CPU backend"); + } + backends.emplace_back(backend_cpu); + + // create a list of the set_n_threads functions in the backends + for (auto & backend : backends) { + ggml_backend_dev_t dev = ggml_backend_get_device(backend.get()); + ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr; + if (reg) { + auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (ggml_backend_set_n_threads_fn) { + set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn); + } + } + } + + llama_set_abort_callback(this, params.abort_callback, params.abort_callback_data); + + if (!kv_self.init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { + LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); + throw std::runtime_error("failed to initialize self-attention cache"); + } + + { + const size_t memory_size_k = kv_self.size_k_bytes(); + const size_t memory_size_v = kv_self.size_v_bytes(); + + LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, + (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), + ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), + ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); + } + + // graph outputs buffer + { + // resized during inference when a batch uses more outputs + if (llama_output_reserve(*this, params.n_seq_max) < params.n_seq_max) { + LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); + throw std::runtime_error("failed to reserve initial output buffer"); + } + + LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__, + ggml_backend_buffer_name (buf_output.get()), + ggml_backend_buffer_get_size(buf_output.get()) / 1024.0 / 1024.0); + } + + // scheduler and compute buffers + { + // buffer types used for the compute buffer of each backend + std::vector backend_buft; + std::vector backend_ptrs; + for (auto & backend : backends) { + auto * buft = 
ggml_backend_get_default_buffer_type(backend.get()); + auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model.devices.empty()) { + // use the host buffer of the first device CPU for faster transfer of the intermediate state + auto * dev = model.devices[0]; + auto * host_buft = ggml_backend_dev_host_buffer_type(dev); + if (host_buft) { + buft = host_buft; + } + } + backend_buft.push_back(buft); + backend_ptrs.push_back(backend.get()); + } + + const size_t max_nodes = model.max_nodes(); + + // buffer used to store the computation graph and the tensor meta data + buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); + + // TODO: move these checks to ggml_backend_sched + // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary + bool pipeline_parallel = + model.n_devices() > 1 && + model.params.n_gpu_layers > (int) model.hparams.n_layer && + model.params.split_mode == LLAMA_SPLIT_MODE_LAYER && + params.offload_kqv; + + // pipeline parallelism requires support for async compute and events in all devices + if (pipeline_parallel) { + for (auto & backend : backends) { + auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) { + // ignore CPU backend + continue; + } + auto * dev = ggml_backend_get_device(backend.get()); + ggml_backend_dev_props props; + ggml_backend_dev_get_props(dev, &props); + if (!props.caps.async || !props.caps.events) { + // device does not support async compute or events + pipeline_parallel = false; + break; + } + } + } + + sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); + + if (pipeline_parallel) { + LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get())); + } + + // initialize scheduler with the worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + + llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + ggml_cgraph * gf_pp = fn_build_graph_worst(*this, ubatch_pp); + + // reserve pp graph first so that buffers are only allocated once + ggml_backend_sched_reserve(sched.get(), gf_pp); + int n_splits_pp = ggml_backend_sched_get_n_splits(sched.get()); + int n_nodes_pp = ggml_graph_n_nodes(gf_pp); + + // reserve with tg graph to get the number of splits and nodes + llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + ggml_cgraph * gf_tg = fn_build_graph_worst(*this, ubatch_tg); + ggml_backend_sched_reserve(sched.get(), gf_tg); + int n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); + int n_nodes_tg = ggml_graph_n_nodes(gf_tg); + + // reserve again with pp graph to avoid ggml-alloc reallocations during inference + gf_pp = fn_build_graph_worst(*this, ubatch_pp); + if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + throw std::runtime_error("failed to allocate compute buffers"); + } + + for (size_t i = 0; i < backend_ptrs.size(); 
++i) { + ggml_backend_t backend = backend_ptrs[i]; + ggml_backend_buffer_type_t buft = backend_buft[i]; + size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend); + if (size > 1) { + LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + ggml_backend_buft_name(buft), + size / 1024.0 / 1024.0); + } + } + + if (n_nodes_pp == n_nodes_tg) { + LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp); + } else { + LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg); + } + if (n_splits_pp == n_splits_tg) { + LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp); + } else { + LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg); + } + } + } + +} + struct llama_batch_manager : public llama_batch_manager_i { llama_batch_manager(llama_context & lctx, const llama_batch & batch, bool logits_all) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { const auto & hparams = lctx.model.hparams; @@ -81,7 +384,7 @@ struct llama_batch_manager : public llama_batch_manager_i { // non-causal masks do not use the KV cache if (hparams.causal_attn) { - llama_kv_self_update(&lctx); + lctx.kv_self_update(); // if we have enough unused cells before the current head -> // better to start searching from the beginning of the cache, hoping to fill it @@ -106,6 +409,8 @@ struct llama_batch_manager : public llama_batch_manager_i { } } + //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); + return true; } diff --git a/src/llama-context.h b/src/llama-context.h index 47233f4f5..d0356e3ed 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -30,11 +30,14 @@ struct llama_batch_manager_i { virtual void finalize() = 0; }; +// TODO: make implementation details private +// TODO: become abstract base class, split the current implementation into different child classes struct llama_context { - llama_context(const llama_model & model) - : model(model) - , t_start_us(model.t_start_us) - , t_load_us (model.t_load_us) {} + // TODO: store the worst-case graph build function and reuse it later + llama_context( + const llama_model & model, + const llama_context_params & params, + std::function fn_build_graph_worst); const struct llama_model & model; diff --git a/src/llama.cpp b/src/llama.cpp index 8f6de199a..408bd9030 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7893,8 +7893,6 @@ static int llama_decode_impl( lctx.need_reserve = false; } - //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - ggml_backend_sched_reset(lctx.sched.get()); ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); @@ -8574,309 +8572,17 @@ struct llama_context * llama_init_from_model( return nullptr; } - llama_context * ctx = new llama_context(*model); + llama_context * ctx = nullptr; - const auto & hparams = model->hparams; - auto & cparams = ctx->cparams; - - cparams.n_seq_max = std::max(1u, params.n_seq_max); - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch; - cparams.yarn_ext_factor = params.yarn_ext_factor; - cparams.yarn_attn_factor = params.yarn_attn_factor; - cparams.yarn_beta_fast = params.yarn_beta_fast; - cparams.yarn_beta_slow = params.yarn_beta_slow; - cparams.defrag_thold = params.defrag_thold; - cparams.embeddings = params.embeddings; - 
cparams.offload_kqv = params.offload_kqv; - cparams.flash_attn = params.flash_attn; - cparams.no_perf = params.no_perf; - cparams.pooling_type = params.pooling_type; - - cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; - cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; - cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; - - cparams.n_ctx = GGML_PAD(cparams.n_ctx, ctx->get_ctx_padding(cparams)); - - // with causal attention, the batch size is limited by the context size - cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch; - - // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask - // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext) - // ref: https://github.com/ggerganov/llama.cpp/pull/5021 - if (cparams.n_batch < GGML_KQ_MASK_PAD) { - LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); - cparams.n_batch = GGML_KQ_MASK_PAD; - } - - cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); - - cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : - hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn : - hparams.n_ctx_train; - - cparams.cb_eval = params.cb_eval; - cparams.cb_eval_user_data = params.cb_eval_user_data; - - auto rope_scaling_type = params.rope_scaling_type; - if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { - rope_scaling_type = hparams.rope_scaling_type_train; - } - - if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) { - cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none - } - - if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' - cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 
1.0f : 0.0f; - } - - cparams.yarn_attn_factor *= hparams.rope_attn_factor; - - if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { - if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { - cparams.pooling_type = LLAMA_POOLING_TYPE_NONE; - } else { - cparams.pooling_type = hparams.pooling_type; - } - } - - if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) { - cparams.causal_attn = hparams.causal_attn; - } else { - cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL; - } - - const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; - - LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); - LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); - LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq); - LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); - LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); - LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); - LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); - LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); - - if (n_ctx_per_seq < hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", - __func__, n_ctx_per_seq, hparams.n_ctx_train); - } - - if (n_ctx_per_seq > hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n", - __func__, n_ctx_per_seq, hparams.n_ctx_train); - } - - ctx->logits_all = params.logits_all; - - // build worst-case graph for encoder if a model contains encoder - ctx->is_encoding = llama_model_has_encoder(model); - - uint32_t kv_size = cparams.n_ctx; - ggml_type type_k = params.type_k; - ggml_type type_v = params.type_v; - - // Mamba only needs a constant number of KV cache cells per sequence - if (llama_model_is_recurrent(model)) { - // Mamba needs at least as many KV cells as there are sequences kept at any time - kv_size = std::max((uint32_t) 1, params.n_seq_max); - // it's probably best to keep as much precision as possible for the states - type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states - type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states - } - - GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); - GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); - - if (!hparams.vocab_only) { - // GPU backends - for (auto * dev : model->devices) { - ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); - llama_free(ctx); - return nullptr; - } - ctx->backends.emplace_back(backend); - } - - // add ACCEL backends (such as BLAS) - for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { - ggml_backend_dev_t dev = ggml_backend_dev_get(i); - if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) { - ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); - llama_free(ctx); - return nullptr; - } - ctx->backends.emplace_back(backend); - } - } - - // add CPU backend - ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); - if (ctx->backend_cpu == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CPU 
backend\n", __func__); - llama_free(ctx); - return nullptr; - } - ctx->backends.emplace_back(ctx->backend_cpu); - - // create a list of the set_n_threads functions in the backends - for (auto & backend : ctx->backends) { - ggml_backend_dev_t dev = ggml_backend_get_device(backend.get()); - ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr; - if (reg) { - auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); - if (ggml_backend_set_n_threads_fn) { - ctx->set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn); - } - } - } - - llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data); - - if (!ctx->kv_self.init(ctx->model, ctx->cparams, type_k, type_v, kv_size, cparams.offload_kqv)) { - LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); - llama_free(ctx); - return nullptr; - } - - { - const size_t memory_size_k = ctx->kv_self.size_k_bytes(); - const size_t memory_size_v = ctx->kv_self.size_v_bytes(); - - LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, - (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), - ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), - ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); - } - - // graph outputs buffer - { - // resized during inference when a batch uses more outputs - if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) { - LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); - llama_free(ctx); - return nullptr; - } - - LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__, - ggml_backend_buffer_name(ctx->buf_output.get()), - ggml_backend_buffer_get_size(ctx->buf_output.get()) / 1024.0 / 1024.0); - } - - // scheduler and compute buffers - { - // buffer types used for the compute buffer of each backend - std::vector backend_buft; - std::vector backend_ptrs; - for (auto & backend : ctx->backends) { - auto * buft = ggml_backend_get_default_buffer_type(backend.get()); - auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); - if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) { - // use the host buffer of the first device CPU for faster transfer of the intermediate state - auto * dev = model->devices[0]; - auto * host_buft = ggml_backend_dev_host_buffer_type(dev); - if (host_buft) { - buft = host_buft; - } - } - backend_buft.push_back(buft); - backend_ptrs.push_back(backend.get()); - } - - const size_t max_nodes = model->max_nodes(); - - // buffer used to store the computation graph and the tensor meta data - ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); - - // TODO: move these checks to ggml_backend_sched - // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary - bool pipeline_parallel = - model->n_devices() > 1 && - model->params.n_gpu_layers > (int)model->hparams.n_layer && - model->params.split_mode == LLAMA_SPLIT_MODE_LAYER && - params.offload_kqv; - - // pipeline parallelism requires support for async compute and events in all devices - if (pipeline_parallel) { - for (auto & backend : ctx->backends) { - auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); - if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) { - // ignore CPU 
backend - continue; - } - auto * dev = ggml_backend_get_device(backend.get()); - ggml_backend_dev_props props; - ggml_backend_dev_get_props(dev, &props); - if (!props.caps.async || !props.caps.events) { - // device does not support async compute or events - pipeline_parallel = false; - break; - } - } - } - - ctx->sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel)); - - if (pipeline_parallel) { - LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched.get())); - } - - // initialize scheduler with the worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - llama_token token = ctx->model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - - llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_pp = llama_build_graph(*ctx, ubatch_pp, true); - - // reserve pp graph first so that buffers are only allocated once - ggml_backend_sched_reserve(ctx->sched.get(), gf_pp); - int n_splits_pp = ggml_backend_sched_get_n_splits(ctx->sched.get()); - int n_nodes_pp = ggml_graph_n_nodes(gf_pp); - - // reserve with tg graph to get the number of splits and nodes - llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_tg = llama_build_graph(*ctx, ubatch_tg, true); - ggml_backend_sched_reserve(ctx->sched.get(), gf_tg); - int n_splits_tg = ggml_backend_sched_get_n_splits(ctx->sched.get()); - int n_nodes_tg = ggml_graph_n_nodes(gf_tg); - - // reserve again with pp graph to avoid ggml-alloc reallocations during inference - gf_pp = llama_build_graph(*ctx, ubatch_pp, true); - if (!ggml_backend_sched_reserve(ctx->sched.get(), gf_pp)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - llama_free(ctx); - return nullptr; - } - - for (size_t i = 0; i < backend_ptrs.size(); ++i) { - ggml_backend_t backend = backend_ptrs[i]; - ggml_backend_buffer_type_t buft = backend_buft[i]; - size_t size = ggml_backend_sched_get_buffer_size(ctx->sched.get(), backend); - if (size > 1) { - LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, - ggml_backend_buft_name(buft), - size / 1024.0 / 1024.0); - } - } - - if (n_nodes_pp == n_nodes_tg) { - LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp); - } else { - LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg); - } - if (n_splits_pp == n_splits_tg) { - LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp); - } else { - LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg); - } - } + try { + // TODO: add logic which llama_context implementation to construct + ctx = new llama_context(*model, params, + [](llama_context & lctx, const llama_ubatch & ubatch) { + return llama_build_graph(lctx, ubatch, true); + }); + } catch (const std::exception & e) { + LLAMA_LOG_ERROR("%s: failed to initialize context: %s\n", __func__, e.what()); + return nullptr; } return ctx; From 918885697e4409208b8157ffd18a6c347ca5b04d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 29 Jan 2025 14:45:04 +0200 Subject: [PATCH 19/28] llama : resolve rwkv conflict ggml-ci --- 
src/llama.cpp | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index f410f7a2f..0ca8070cd 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7076,19 +7076,13 @@ struct llm_build_context { // 1 // ); + // struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att)); // ggml_build_forward_expand( // gf, // ggml_cpy( // ctx0, - // wkv_states, - // ggml_view_1d( - // ctx0, - // kv_self.v_l[il], - // hparams.n_embd_v_s() * n_seqs, - // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - // ) - // ) - // ); + // ggml_view_1d(ctx0, last_norm_att, n_embd * n_seqs, 0), + // ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, build_rwkv6_time_mix(layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, hparams.n_head_kv())); // ggml_build_forward_expand(gf, ffn_inp); From 3e23be7911704f8474e7dcb32424bb043be63b06 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 2 Feb 2025 10:17:42 +0200 Subject: [PATCH 20/28] context : store graph build function callback ggml-ci --- src/llama-context.cpp | 37 +++++++++++++++++++++++++++++++++---- src/llama-context.h | 8 ++++++-- src/llama.cpp | 4 ++-- 3 files changed, 41 insertions(+), 8 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 37e43213a..1cd168db2 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -33,8 +33,12 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } -llama_context::llama_context(const llama_model & model, const llama_context_params & params, std::function fn_build_graph_worst) : +llama_context::llama_context( + const llama_model & model, + const llama_context_params & params, + build_graph_callback && cb_build_graph) : model(model), + cb_build_graph(std::move(cb_build_graph)), t_start_us(model.t_start_us), t_load_us (model.t_load_us) { @@ -289,7 +293,7 @@ llama_context::llama_context(const llama_model & model, const llama_context_para llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_pp = fn_build_graph_worst(*this, ubatch_pp); + ggml_cgraph * gf_pp = this->cb_build_graph(*this, ubatch_pp, true); // reserve pp graph first so that buffers are only allocated once ggml_backend_sched_reserve(sched.get(), gf_pp); @@ -298,13 +302,13 @@ llama_context::llama_context(const llama_model & model, const llama_context_para // reserve with tg graph to get the number of splits and nodes llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - ggml_cgraph * gf_tg = fn_build_graph_worst(*this, ubatch_tg); + ggml_cgraph * gf_tg = this->cb_build_graph(*this, ubatch_tg, true); ggml_backend_sched_reserve(sched.get(), gf_tg); int n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); int n_nodes_tg = ggml_graph_n_nodes(gf_tg); // reserve again with pp graph to avoid ggml-alloc reallocations during inference - gf_pp = fn_build_graph_worst(*this, ubatch_pp); + gf_pp = this->cb_build_graph(*this, ubatch_pp, true); if 
(!ggml_backend_sched_reserve(sched.get(), gf_pp)) { LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); throw std::runtime_error("failed to allocate compute buffers"); @@ -475,6 +479,31 @@ struct llama_batch_manager : public llama_batch_manager_i { //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); + // reserve a worst case graph if needed + if (lctx.need_reserve) { + LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__); + + const auto & cparams = lctx.cparams; + const auto & model = lctx.model; + + // build worst-case graph + uint32_t n_seqs = 1; // TODO: worst-case number of sequences + uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + + llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; + + ggml_cgraph * gf = lctx.cb_build_graph(lctx, ubatch, true); + + // initialize scheduler with the worst-case graph + ggml_backend_sched_reset(lctx.sched.get()); + if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { + LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); + } + + lctx.need_reserve = false; + } + return true; } diff --git a/src/llama-context.h b/src/llama-context.h index 1277645de..5958deaef 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -36,11 +36,13 @@ struct llama_batch_manager_i { // TODO: make implementation details private // TODO: become abstract base class, split the current implementation into different child classes struct llama_context { - // TODO: store the worst-case graph build function and reuse it later + // TODO: tmp until llama-model starts implementing the graph build function + typedef std::function build_graph_callback; + llama_context( const llama_model & model, const llama_context_params & params, - std::function fn_build_graph_worst); + build_graph_callback && cb_build_graph); const struct llama_model & model; @@ -49,6 +51,8 @@ struct llama_context { llama_adapter_cvec cvec; llama_loras loras; + build_graph_callback cb_build_graph; + std::vector backends; std::vector> set_n_threads_fns; diff --git a/src/llama.cpp b/src/llama.cpp index 0ca8070cd..6268249f2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8508,8 +8508,8 @@ struct llama_context * llama_init_from_model( try { // TODO: add logic which llama_context implementation to construct ctx = new llama_context(*model, params, - [](llama_context & lctx, const llama_ubatch & ubatch) { - return llama_build_graph(lctx, ubatch, true); + [](llama_context & lctx, const llama_ubatch & ubatch, bool worst_case) { + return llama_build_graph(lctx, ubatch, worst_case); }); } catch (const std::exception & e) { LLAMA_LOG_ERROR("%s: failed to initialize context: %s\n", __func__, e.what()); From 1eca8916b51a6952a304e68f312b63649a6cead9 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Mon, 3 Feb 2025 20:17:50 +0800 Subject: [PATCH 21/28] llama : fix rwkv inference (#11618) Signed-off-by: Molly Sophia --- src/llama-context.cpp | 222 ++++++++++++++++++ src/llama-context.h | 27 +++ src/llama.cpp | 509 +++++++++++++----------------------------- 3 files changed, 409 insertions(+), 349 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 1cd168db2..3bc0513ca 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1970,6 +1970,228 @@ 
ggml_tensor * llama_context::build_mamba_layer( } +ggml_tensor * llama_context::build_rwkv_token_shift_load( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto token_shift_count = hparams.token_shift_count; + + const auto & n_tokens = ubatch.n_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + struct ggml_tensor * token_shift_all = kv_self.k_l[il]; + + struct ggml_tensor * token_shift = build_copy_mask_state( + ctx0, graph, token_shift_all, state_copy, state_mask, + n_tokens, hparams.n_embd_k_s(), n_seqs, worst_case); + + token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs); + + return token_shift; +} + + +ggml_tensor * llama_context::build_rwkv_token_shift_store( + ggml_context * ctx0, + ggml_tensor * token_shift, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto token_shift_count = hparams.token_shift_count; + const auto n_embd = hparams.n_embd; + + const auto & n_tokens = ubatch.n_tokens; + const int64_t n_seqs = ubatch.n_seqs; + + const auto kv_head = worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head; + + return ggml_cpy( + ctx0, + ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * token_shift_count, 0), + ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) + ); +} + + +ggml_tensor * llama_context::build_rwkv6_time_mix( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * x_prev, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case) { + const auto & hparams = model.hparams; + + const auto n_tokens = ubatch.n_tokens; + const auto n_seqs = ubatch.n_seqs; + const auto n_embd = hparams.n_embd; + const auto head_size = hparams.wkv_head_size; + const auto n_head = n_embd / head_size; + const auto n_head_kv = hparams.n_head_kv(il); + + const auto kv_head = worst_case ? (kv_self.recurrent ? 
0 : kv_self.size - n_tokens) : kv_self.head; + + const auto layer = &model.layers[il]; + + bool is_qrwkv = layer->time_mix_first == nullptr; + + struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); + struct ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->time_mix_lerp_x), cur); + + xxx = ggml_reshape_4d( + ctx0, + ggml_tanh( + ctx0, + ggml_mul_mat(ctx0, layer->time_mix_w1, xxx) + ), + layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens + ); + + xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2)); + + xxx = ggml_mul_mat( + ctx0, + ggml_reshape_4d( + ctx0, + layer->time_mix_w2, + layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 + ), + xxx + ); + + struct ggml_tensor *xw, *xk, *xv, *xr, *xg; + if (layer->time_mix_lerp_fused) { + // fusing these weights makes some performance improvement + sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens); + cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); + xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer->time_mix_lerp_fused), sx), cur); + xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); + xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); + } else { + // for backward compatibility + xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); + xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); + + xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer->time_mix_lerp_w), sx), cur); + xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer->time_mix_lerp_k), sx), cur); + xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer->time_mix_lerp_v), sx), cur); + xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer->time_mix_lerp_r), sx), cur); + xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer->time_mix_lerp_g), sx), cur); + } + + struct ggml_tensor * r = build_lora_mm(ctx0, layer->time_mix_receptance, xr); + struct ggml_tensor * k = build_lora_mm(ctx0, layer->time_mix_key, xk); + struct ggml_tensor * v = build_lora_mm(ctx0, layer->time_mix_value, xv); + if (layer->time_mix_receptance_b) { + r = ggml_add(ctx0, r, layer->time_mix_receptance_b); + } + if (layer->time_mix_key_b) { + k = ggml_add(ctx0, k, layer->time_mix_key_b); + } + if (layer->time_mix_value_b) { + v = ggml_add(ctx0, v, layer->time_mix_value_b); + } + + struct ggml_tensor * g = build_lora_mm(ctx0, layer->time_mix_gate, xg); + if (is_qrwkv) { + g = ggml_sigmoid(ctx0, g); + } else { + g = ggml_silu(ctx0, g); + } + + if (n_head_kv != 0 && n_head_kv != n_head) { + GGML_ASSERT(n_head % n_head_kv == 0); + k = ggml_reshape_4d(ctx0, k, head_size, 1, n_head_kv, n_tokens); + v = ggml_reshape_4d(ctx0, v, head_size, 1, n_head_kv, n_tokens); + struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, n_head / n_head_kv, n_head_kv, n_tokens); + k = ggml_repeat(ctx0, k, tmp); + v = ggml_repeat(ctx0, v, tmp); + } + + k = 
ggml_reshape_3d(ctx0, k, head_size, n_head, n_tokens); + v = ggml_reshape_3d(ctx0, v, head_size, n_head, n_tokens); + r = ggml_reshape_3d(ctx0, r, head_size, n_head, n_tokens); + + struct ggml_tensor * w = ggml_mul_mat( + ctx0, + layer->time_mix_decay_w2, + ggml_tanh( + ctx0, + ggml_mul_mat(ctx0, layer->time_mix_decay_w1, xw) + ) + ); + + w = ggml_add(ctx0, w, layer->time_mix_decay); + w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w))); + w = ggml_reshape_3d(ctx0, w, head_size, n_head, n_tokens); + + if (is_qrwkv) { + // k = k * (1 - w) + k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w)); + } + + struct ggml_tensor * wkv_state = build_copy_mask_state( + ctx0, graph, kv_self.v_l[il], state_copy, state_mask, + n_tokens, hparams.n_embd_v_s(), n_seqs, worst_case); + + struct ggml_tensor * wkv_output; + if (is_qrwkv) { + wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, wkv_state, pow(head_size, -0.5f)); + } else { + wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer->time_mix_first, w, wkv_state); + } + cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0); + wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); + + ggml_build_forward_expand( + graph, + ggml_cpy( + ctx0, + wkv_state, + ggml_view_1d( + ctx0, + kv_self.v_l[il], + hparams.n_embd_v_s() * n_seqs, + hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) + ) + ) + ); + + if (!is_qrwkv) { + // group norm with head_count groups + cur = ggml_reshape_3d(ctx0, cur, n_embd / n_head, n_head, n_tokens); + cur = ggml_norm(ctx0, cur, 64e-5f); + + // Convert back to regular vectors. + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer->time_mix_ln), layer->time_mix_ln_b); + } else { + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + } + + cur = ggml_mul(ctx0, cur, g); + cur = build_lora_mm(ctx0, layer->time_mix_output, cur); + + return cur; +} + // llama output size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { diff --git a/src/llama-context.h b/src/llama-context.h index 5958deaef..4cf4a6312 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -248,6 +248,33 @@ struct llama_context { int il, bool worst_case); + ggml_tensor * build_rwkv_token_shift_load( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case); + + ggml_tensor * build_rwkv_token_shift_store( + ggml_context * ctx0, + ggml_tensor * token_shift, + const llama_ubatch & ubatch, + int il, + bool worst_case); + + ggml_tensor * build_rwkv6_time_mix( + ggml_context * ctx0, + ggml_cgraph * graph, + ggml_tensor * cur, + ggml_tensor * x_prev, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il, + bool worst_case); + struct ggml_tensor * inp_s_copy; // I32 [kv_size] struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] diff --git a/src/llama.cpp b/src/llama.cpp index 64a5efd2d..171ea2017 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -574,175 +574,34 @@ struct llm_build_context { return cur; } - //struct ggml_tensor * build_rwkv6_time_mix( - // const struct llama_layer * layer, - // struct ggml_tensor * cur, - // struct ggml_tensor * x_prev, - // struct ggml_tensor ** wkv_state, - // size_t wkv_head_size, - // size_t head_count_kv) { - // size_t n_embd = cur->ne[0]; - // size_t n_seq_tokens = cur->ne[1]; - // size_t n_seqs = cur->ne[2]; + struct ggml_tensor * build_rwkv_channel_mix( + 
const struct llama_layer * layer, + struct ggml_tensor * cur, + struct ggml_tensor * x_prev, + const llm_arch arch) { + struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); + switch (arch) { + case LLM_ARCH_RWKV6: + { + struct ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur); + struct ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur); - // size_t head_size = wkv_head_size; - // size_t head_count = n_embd / head_size; + struct ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr)); + struct ggml_tensor * k = ggml_sqr( + ctx0, + ggml_relu( + ctx0, + build_lora_mm(layer->channel_mix_key, xk) + ) + ); + cur = ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k)); + } break; + default: + GGML_ABORT("fatal error"); + } - // size_t n_tokens = n_seqs * n_seq_tokens; - - // bool is_qrwkv = layer->time_mix_first == nullptr; - - // struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); - - // sx = ggml_reshape_2d(ctx0, sx, n_embd, n_tokens); - // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - - // struct ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->time_mix_lerp_x), cur); - - // xxx = ggml_reshape_4d( - // ctx0, - // ggml_tanh( - // ctx0, - // ggml_mul_mat(ctx0, layer->time_mix_w1, xxx) - // ), - // layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens - // ); - - // xxx = ggml_cont(ctx0, ggml_permute(ctx0, xxx, 0, 1, 3, 2)); - - // xxx = ggml_mul_mat( - // ctx0, - // ggml_reshape_4d( - // ctx0, - // layer->time_mix_w2, - // layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5 - // ), - // xxx - // ); - - // struct ggml_tensor *xw, *xk, *xv, *xr, *xg; - // if (layer->time_mix_lerp_fused) { - // // fusing these weights makes some performance improvement - // sx = ggml_reshape_3d(ctx0, sx, n_embd, 1, n_tokens); - // cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); - // xxx = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xxx, layer->time_mix_lerp_fused), sx), cur); - // xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); - // xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); - // xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); - // xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); - // xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - // } else { - // // for backward compatibility - // xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0); - // xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); - // xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); - // xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); - // xg = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); - - // xw = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xw, layer->time_mix_lerp_w), sx), cur); - // xk = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xk, layer->time_mix_lerp_k), sx), cur); - // xv = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xv, layer->time_mix_lerp_v), sx), cur); - // xr = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xr, layer->time_mix_lerp_r), sx), cur); - // xg = ggml_add(ctx0, ggml_mul(ctx0, ggml_add(ctx0, xg, layer->time_mix_lerp_g), sx), cur); - // } - - // struct ggml_tensor * r = 
build_lora_mm(layer->time_mix_receptance, xr); - // struct ggml_tensor * k = build_lora_mm(layer->time_mix_key, xk); - // struct ggml_tensor * v = build_lora_mm(layer->time_mix_value, xv); - // if (layer->time_mix_receptance_b) { - // r = ggml_add(ctx0, r, layer->time_mix_receptance_b); - // } - // if (layer->time_mix_key_b) { - // k = ggml_add(ctx0, k, layer->time_mix_key_b); - // } - // if (layer->time_mix_value_b) { - // v = ggml_add(ctx0, v, layer->time_mix_value_b); - // } - - // struct ggml_tensor * g = build_lora_mm(layer->time_mix_gate, xg); - // if (is_qrwkv) { - // g = ggml_sigmoid(ctx0, g); - // } else { - // g = ggml_silu(ctx0, g); - // } - - // if (head_count_kv != head_count) { - // GGML_ASSERT(head_count % head_count_kv == 0); - // k = ggml_reshape_4d(ctx0, k, head_size, 1, head_count_kv, n_tokens); - // v = ggml_reshape_4d(ctx0, v, head_size, 1, head_count_kv, n_tokens); - // struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, head_size, head_count / head_count_kv, head_count_kv, n_tokens); - // k = ggml_repeat(ctx0, k, tmp); - // v = ggml_repeat(ctx0, v, tmp); - // } - - // k = ggml_reshape_3d(ctx0, k, head_size, head_count, n_tokens); - // v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens); - // r = ggml_reshape_3d(ctx0, r, head_size, head_count, n_tokens); - - // struct ggml_tensor * w = ggml_mul_mat( - // ctx0, - // layer->time_mix_decay_w2, - // ggml_tanh( - // ctx0, - // ggml_mul_mat(ctx0, layer->time_mix_decay_w1, xw) - // ) - // ); - - // w = ggml_add(ctx0, w, layer->time_mix_decay); - // w = ggml_exp(ctx0, ggml_neg(ctx0, ggml_exp(ctx0, w))); - // w = ggml_reshape_3d(ctx0, w, head_size, head_count, n_tokens); - - // if (is_qrwkv) { - // // k = k * (1 - w) - // k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w)); - // } - - // struct ggml_tensor * wkv_output; - // if (!layer->time_mix_first) { - // wkv_output = ggml_gated_linear_attn(ctx0, k, v, r, w, *wkv_state, pow(head_size, -0.5f)); - // } else { - // wkv_output = ggml_rwkv_wkv6(ctx0, k, v, r, layer->time_mix_first, w, *wkv_state); - // } - // cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0); - // *wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); - - // if (!is_qrwkv) { - // // group norm with head_count groups - // cur = ggml_reshape_3d(ctx0, cur, n_embd / head_count, head_count, n_tokens); - // cur = ggml_norm(ctx0, cur, 64e-5f); - - // // Convert back to regular vectors. 
- // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - // cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer->time_mix_ln), layer->time_mix_ln_b); - // } else { - // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - // } - - // cur = ggml_mul(ctx0, cur, g); - // cur = build_lora_mm(layer->time_mix_output, cur); - - // return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs); - //} - - //struct ggml_tensor * build_rwkv6_channel_mix( - // const struct llama_layer * layer, - // struct ggml_tensor * cur, - // struct ggml_tensor * x_prev) { - // struct ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); - // struct ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur); - // struct ggml_tensor * xr = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_r), cur); - - // struct ggml_tensor * r = ggml_sigmoid(ctx0, build_lora_mm(layer->channel_mix_receptance, xr)); - // struct ggml_tensor * k = ggml_sqr( - // ctx0, - // ggml_relu( - // ctx0, - // build_lora_mm(layer->channel_mix_key, xk) - // ) - // ); - - // return ggml_mul(ctx0, r, build_lora_mm(layer->channel_mix_value, k)); - //} + return cur; + } struct ggml_cgraph * build_k_shift() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); @@ -6935,226 +6794,178 @@ struct llm_build_context { return gf; } - //ggml_cgraph * build_rwkv6() { - // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + ggml_cgraph * build_rwkv6() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // // Token shift state dimensions should be 2 * n_emb - // GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2); + GGML_ASSERT(hparams.token_shift_count == 2); - // const int64_t n_seqs = ubatch.n_seqs; - // const int64_t n_seq_tokens = ubatch.n_seq_tokens; - // const int64_t n_tokens = ubatch.n_tokens; - // GGML_ASSERT(n_seqs != 0); - // GGML_ASSERT(ubatch.equal_seqs); - // GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); + struct ggml_tensor * cur; + struct ggml_tensor * inpL; - // struct ggml_tensor * cur; - // struct ggml_tensor * inpL; - // struct ggml_tensor * state_copy = build_inp_s_copy(); - // struct ggml_tensor * state_mask = build_inp_s_mask(); + inpL = build_inp_embd(model.tok_embd); + inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); - // inpL = build_inp_embd(model.tok_embd); - // inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); + struct ggml_tensor * state_copy = lctx.build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lctx.build_inp_s_mask(ctx0, worst_case); - // for (int il = 0; il < n_layer; ++il) { - // const llama_layer * layer = &model.layers[il]; + const auto n_embd = hparams.n_embd; + const auto n_seq_tokens = ubatch.n_seq_tokens; + const auto n_seqs = ubatch.n_seqs; - // // (ab)using the KV cache to store the states - // struct ggml_tensor * token_shift = build_copy_mask_state( - // gf, kv_self.k_l[il], state_copy, state_mask, - // hparams.n_embd_k_s(), n_seqs); + for (int il = 0; il < n_layer; ++il) { + const llama_layer * layer = &model.layers[il]; - // struct ggml_tensor * wkv_states = build_copy_mask_state( - // gf, kv_self.v_l[il], state_copy, state_mask, - // hparams.n_embd_v_s(), n_seqs); + struct ggml_tensor * token_shift = lctx.build_rwkv_token_shift_load( + ctx0, gf, state_copy, state_mask, ubatch, il, worst_case + ); - // cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - // token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 
2, n_seqs); + struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); + struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); - // struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0); - // struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift)); + struct ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il); + cb(att_norm, "attn_norm", il); - // struct ggml_tensor * x_norm_att = build_norm(cur, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il); - // struct ggml_tensor * x_prev = ggml_concat( - // ctx0, - // att_shift, - // ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), - // 1 - // ); + struct ggml_tensor * x_prev = ggml_concat( + ctx0, + att_shift, + ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0), + 1 + ); - // cur = ggml_add(ctx0, cur, build_rwkv6_time_mix(layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, n_embd / hparams.wkv_head_size)); - // ggml_build_forward_expand(gf, cur); - // ggml_build_forward_expand( - // gf, - // ggml_cpy( - // ctx0, - // wkv_states, - // ggml_view_1d( - // ctx0, - // kv_self.v_l[il], - // hparams.n_embd_v_s() * n_seqs, - // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - // ) - // ) - // ); + cur = lctx.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); - // struct ggml_tensor * x_norm_ffn = build_norm(cur, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il); - // x_prev = ggml_concat( - // ctx0, - // ffn_shift, - // ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0), - // 1 - // ); - // cur = ggml_add(ctx0, cur, build_rwkv6_channel_mix(layer, x_norm_ffn, x_prev)); - // ggml_build_forward_expand(gf, cur); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); - // struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att)); - // struct ggml_tensor * last_norm_ffn = ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_ffn)); + struct ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il); + cb(ffn_norm, "ffn_norm", il); - // token_shift = ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1); + x_prev = ggml_concat( + ctx0, + ffn_shift, + ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0), + 1 + ); - // ggml_build_forward_expand( - // gf, - // ggml_cpy( - // ctx0, - // ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0), - // ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) - // ) - // ); + cur = build_rwkv_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6); + cur = ggml_add(ctx0, cur, ffn_inp); - // if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { - 
// cur = ggml_scale(ctx0, cur, 0.5F); - // } + token_shift = ggml_concat(ctx0, + ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)), + ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)), + 1 + ); + ggml_build_forward_expand(gf, lctx.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); - // cur = lctx.cvec.apply_to(ctx0, cur, il); - // cb(cur, "l_out", il); + if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { + cur = ggml_scale(ctx0, cur, 0.5F); + } - // // input for next layer - // inpL = cur; - // } + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); - // cur = inpL; - // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - // cur = ggml_get_rows(ctx0, cur, inp_out_ids); + // input for next layer + inpL = cur; + } - // cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); - // cb(cur, "result_norm", -1); + cur = inpL; + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); - // cur = build_lora_mm(model.output, cur); - // cb(cur, "result_output", -1); + cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); + cb(cur, "result_norm", -1); - // ggml_build_forward_expand(gf, cur); + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); - // return gf; - //} + ggml_build_forward_expand(gf, cur); + + return gf; + } // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py - //ggml_cgraph * build_rwkv6qwen2() { - // struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); + ggml_cgraph * build_rwkv6qwen2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // GGML_ASSERT(n_embd == hparams.n_embd_k_s()); + GGML_ASSERT(n_embd == hparams.n_embd_k_s()); - // const int64_t n_seqs = ubatch.n_seqs; - // const int64_t n_seq_tokens = ubatch.n_seq_tokens; - // const int64_t n_tokens = ubatch.n_tokens; - // GGML_ASSERT(n_seqs != 0); - // GGML_ASSERT(ubatch.equal_seqs); - // GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); + struct ggml_tensor * cur; + struct ggml_tensor * inpL; - // struct ggml_tensor * cur; - // struct ggml_tensor * inpL; - // struct ggml_tensor * state_copy = build_inp_s_copy(); - // struct ggml_tensor * state_mask = build_inp_s_mask(); + inpL = build_inp_embd(model.tok_embd); - // inpL = build_inp_embd(model.tok_embd); + struct ggml_tensor * state_copy = lctx.build_inp_s_copy(ctx0, worst_case); + struct ggml_tensor * state_mask = lctx.build_inp_s_mask(ctx0, worst_case); - // for (int il = 0; il < n_layer; ++il) { - // const llama_layer * layer = &model.layers[il]; + const auto n_embd = hparams.n_embd; + const auto n_seq_tokens = ubatch.n_seq_tokens; + const auto n_seqs = ubatch.n_seqs; - // // (ab)using the KV cache to store the states - // struct ggml_tensor * token_shift = build_copy_mask_state( - // gf, kv_self.k_l[il], state_copy, state_mask, - // hparams.n_embd_k_s(), n_seqs); + inpL = build_inp_embd(model.tok_embd); - // struct ggml_tensor * wkv_states = build_copy_mask_state( - // gf, kv_self.v_l[il], state_copy, state_mask, - // hparams.n_embd_v_s(), n_seqs); + for (int il = 0; il < n_layer; ++il) { + const 
llama_layer * layer = &model.layers[il]; - // cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); - // token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 1, n_seqs); + struct ggml_tensor * token_shift = lctx.build_rwkv_token_shift_load( + ctx0, gf, state_copy, state_mask, ubatch, il, worst_case + ); - // struct ggml_tensor * x_norm_att = build_norm(cur, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il); - // struct ggml_tensor * x_prev = ggml_concat( - // ctx0, - // token_shift, - // ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0), - // 1 - // ); + struct ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il); + cb(att_norm, "attn_norm", il); - // struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att)); - // ggml_build_forward_expand( - // gf, - // ggml_cpy( - // ctx0, - // ggml_view_1d(ctx0, last_norm_att, n_embd * n_seqs, 0), - // ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il])) + struct ggml_tensor * x_prev = ggml_concat( + ctx0, + token_shift, + ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0), + 1 + ); - // struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, build_rwkv6_time_mix(layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, hparams.n_head_kv())); - // ggml_build_forward_expand(gf, ffn_inp); - // ggml_build_forward_expand( - // gf, - // ggml_cpy( - // ctx0, - // wkv_states, - // ggml_view_1d( - // ctx0, - // kv_self.v_l[il], - // hparams.n_embd_v_s() * n_seqs, - // hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il]) - // ) - // ) - // ); + cur = lctx.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case); - // cb(ffn_inp, "ffn_inp", il); + token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)); + ggml_build_forward_expand(gf, lctx.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case)); - // // feed-forward network - // cur = build_norm(ffn_inp, - // model.layers[il].ffn_norm, NULL, - // LLM_NORM_RMS, il); - // cb(cur, "ffn_norm", il); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); - // cur = build_ffn(cur, - // model.layers[il].ffn_up, NULL, NULL, - // model.layers[il].ffn_gate, NULL, NULL, - // model.layers[il].ffn_down, NULL, NULL, - // NULL, - // LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - // cb(cur, "ffn_out", il); + // feed-forward network + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); - // cur = ggml_add(ctx0, cur, ffn_inp); - // cur = lctx.cvec.apply_to(ctx0, cur, il); - // cb(cur, "l_out", il); + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); - // // input for next layer - // inpL = cur; - // } + cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); - // cur = inpL; - // struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - // cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - // cur = 
ggml_get_rows(ctx0, cur, inp_out_ids); + // input for next layer + inpL = cur; + } - // cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1); - // cb(cur, "result_norm", -1); + cur = inpL; + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); - // cur = build_lora_mm(model.output, cur); - // cb(cur, "result_output", -1); + cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); - // ggml_build_forward_expand(gf, cur); + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); - // return gf; - //} + ggml_build_forward_expand(gf, cur); + + return gf; + } // ref: https://github.com/facebookresearch/chameleon // based on the original build_llama() function, changes: @@ -7726,14 +7537,14 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_exaone(); } break; - //case LLM_ARCH_RWKV6: - // { - // result = llm.build_rwkv6(); - // } break; - //case LLM_ARCH_RWKV6QWEN2: - // { - // result = llm.build_rwkv6qwen2(); - // } break; + case LLM_ARCH_RWKV6: + { + result = llm.build_rwkv6(); + } break; + case LLM_ARCH_RWKV6QWEN2: + { + result = llm.build_rwkv6qwen2(); + } break; case LLM_ARCH_CHAMELEON: { result = llm.build_chameleon(); From e0d913fccbffe7913b2fa6a00590ca68800c9b59 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Feb 2025 10:02:50 +0200 Subject: [PATCH 22/28] llama : clear whitespaces --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 171ea2017..f03386af4 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6850,7 +6850,7 @@ struct llm_build_context { cur = build_rwkv_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6); cur = ggml_add(ctx0, cur, ffn_inp); - token_shift = ggml_concat(ctx0, + token_shift = ggml_concat(ctx0, ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)), ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)), 1 From b15fede7a9a044d0a15da03b9ceb08f7007bfc95 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Feb 2025 14:34:45 +0200 Subject: [PATCH 23/28] kv-cache : fix defrag condition ggml-ci --- src/llama-context.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 3bc0513ca..719622eaa 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -532,11 +532,13 @@ struct llama_batch_manager : public llama_batch_manager_i { // decide if we need to defrag the kv cache if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) { - const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f; + // - do not defrag small contexts (i.e. < 2048 tokens) + // - count the padding towards the number of used tokens + const float fragmentation = kv_self.n >= 2048 ? 
1.0f - float(kv_self.used + lctx.get_ctx_padding(cparams))/float(kv_self.n) : 0.0f; // queue defragmentation for next llama_kv_cache_update if (fragmentation > cparams.defrag_thold) { - //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation); + LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation); kv_self.defrag(); } From f9971ef2e1754f8dde65d5fc0602b7719a0c5326 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 10 Feb 2025 14:59:51 +0200 Subject: [PATCH 24/28] llama : dedup reserve code --- src/llama.cpp | 50 ++------------------------------------------------ 1 file changed, 2 insertions(+), 48 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 465938cf0..e89e70bbe 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7629,30 +7629,6 @@ static int llama_decode_impl( return -3; } - // reserve a worst case graph if needed - // TODO: extract to a function - if (lctx.need_reserve) { - const auto & cparams = lctx.cparams; - const auto & model = lctx.model; - - // build worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - - llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - - ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true); - - // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(lctx.sched.get()); - if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - } - - lctx.need_reserve = false; - } - ggml_backend_sched_reset(lctx.sched.get()); ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); @@ -7889,30 +7865,8 @@ static int llama_encode_impl( //batch_manager->prepare(ubatch); - // reserve a worst case graph if needed - // TODO: extract to a function - if (lctx.need_reserve) { - // TODO: extract to a function - const auto & cparams = lctx.cparams; - const auto & model = lctx.model; - - // build worst-case graph - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); - - llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - - ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true); - - // initialize scheduler with the worst-case graph - ggml_backend_sched_reset(lctx.sched.get()); - if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); - } - - lctx.need_reserve = false; - } + // TODO: do reserve + GGML_ASSERT(lctx.need_reserve == false); ggml_backend_sched_reset(lctx.sched.get()); ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); From 879ba82777b93f30c32eca731d0bf03e7fd20be7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 10 Feb 2025 15:00:02 +0200 Subject: [PATCH 25/28] server : increase context size for the tests ggml-ci --- examples/server/tests/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/examples/server/tests/utils.py b/examples/server/tests/utils.py index ce0680662..97d650a9c 100644 --- a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -280,7 +280,7 @@ class ServerPreset: server.model_hf_repo = "ggml-org/models" server.model_hf_file = "tinyllamas/stories260K.gguf" server.model_alias = "tinyllama-2" - server.n_ctx = 256 + server.n_ctx = 512 server.n_batch = 32 server.n_slots = 2 server.n_predict = 64 From ef358ee78f08e4d7af3916e0d101925c5bc6e122 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 10 Feb 2025 16:11:17 +0200 Subject: [PATCH 26/28] context : add decode/encode ggml-ci --- src/llama-context.cpp | 630 ++++++++++++++++++++++++++++++++++-------- src/llama-context.h | 32 +-- src/llama.cpp | 386 +------------------------- 3 files changed, 526 insertions(+), 522 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 7705d583b..5d21dd5ef 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -9,6 +9,121 @@ #include #include +// llama output (TMP) + +// Make sure enough space is available for outputs. +// Returns max number of outputs for which space was reserved. +static size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { + const auto & cparams = lctx.cparams; + const auto & hparams = lctx.model.hparams; + const auto & vocab = lctx.model.vocab; + + const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); + + const auto n_batch = cparams.n_batch; + const auto n_vocab = vocab.n_tokens(); + const auto n_embd = hparams.n_embd; + + // TODO: use a per-batch flag for logits presence instead + const bool has_logits = !cparams.embeddings; + const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); + + const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0; + const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0; + + if (lctx.output_ids.empty()) { + // init, never resized afterwards + lctx.output_ids.resize(n_batch); + } + + const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0; + const size_t new_size = (logits_size + embd_size) * sizeof(float); + + // alloc only when more than the current capacity is required + // TODO: also consider shrinking the buffer + if (!lctx.buf_output || prev_size < new_size) { + if (lctx.buf_output) { +#ifndef NDEBUG + // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) + LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); +#endif + lctx.buf_output = nullptr; + lctx.logits = nullptr; + lctx.embd = nullptr; + } + + auto * buft = ggml_backend_cpu_buffer_type(); + // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory + auto * output_dev = lctx.model.dev_output(); + auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr; + if (output_dev_host_buft) { + buft = output_dev_host_buft; + } + lctx.buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); + if (lctx.buf_output == nullptr) { + LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); + return 0; + } + } + + float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output.get()); + + lctx.logits = has_logits ? output_base : nullptr; + lctx.embd = has_embd ? 
output_base + logits_size : nullptr; + + lctx.output_size = n_outputs_max; + lctx.logits_size = logits_size; + lctx.embd_size = embd_size; + + // set all ids as invalid (negative) + std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1); + + ggml_backend_buffer_clear(lctx.buf_output.get(), 0); + + lctx.n_outputs = 0; + + return n_outputs_max; +} + +// make the outputs have the same order they had in the user-provided batch +static void llama_output_reorder(struct llama_context & ctx) { + std::vector & out_ids = ctx.sbatch.out_ids; + if (!out_ids.empty()) { + const uint32_t n_vocab = ctx.model.vocab.n_tokens(); + const uint32_t n_embd = ctx.model.hparams.n_embd; + + const int32_t n_outputs = ctx.n_outputs; + GGML_ASSERT((size_t) n_outputs == out_ids.size()); + + // TODO: is there something more efficient which also minimizes swaps? + // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) + for (int32_t i = 0; i < n_outputs - 1; ++i) { + int32_t j_min = i; + for (int32_t j = i + 1; j < n_outputs; ++j) { + if (out_ids[j] < out_ids[j_min]) { + j_min = j; + } + } + if (j_min == i) { continue; } + std::swap(out_ids[i], out_ids[j_min]); + if (ctx.logits_size > 0) { + for (uint32_t k = 0; k < n_vocab; k++) { + std::swap(ctx.logits[i*n_vocab + k], ctx.logits[j_min*n_vocab + k]); + } + } + if (ctx.embd_size > 0) { + for (uint32_t k = 0; k < n_embd; k++) { + std::swap(ctx.embd[i*n_embd + k], ctx.embd[j_min*n_embd + k]); + } + } + } + std::fill(ctx.output_ids.begin(), ctx.output_ids.end(), -1); + for (int32_t i = 0; i < n_outputs; ++i) { + ctx.output_ids[out_ids[i]] = i; + } + out_ids.clear(); + } +} static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { // TODO move to hparams if a T5 variant appears that uses a different value const int64_t max_distance = 128; @@ -340,6 +455,20 @@ llama_context::llama_context( } +struct llama_batch_manager_i { + virtual ~llama_batch_manager_i() = default; + + virtual bool is_done() const = 0; + virtual llama_ubatch next() = 0; + virtual bool prepare() = 0; + virtual void restore() = 0; + virtual void update() = 0; + virtual void finalize() = 0; + + // TODO: might be temporary + int64_t n_outputs_all = 0; +}; + struct llama_batch_manager : public llama_batch_manager_i { llama_batch_manager(llama_context & lctx, const llama_batch & batch) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) { const auto & model = lctx.model; @@ -398,6 +527,10 @@ struct llama_batch_manager : public llama_batch_manager_i { ~llama_batch_manager() override { } + virtual bool is_done() const override { + return lctx.sbatch.n_tokens == 0; + } + virtual llama_ubatch next() override { ubatch = llama_ubatch(); @@ -558,6 +691,390 @@ std::unique_ptr llama_context::prepare_batch(const llama_ return std::make_unique(*this, batch); } +int llama_context::decode(llama_batch & inp_batch) { + is_encoding = false; + + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; + } + + // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : pos_max() + 1); + + const llama_batch & batch = batch_allocr.batch; + + const auto & vocab = model.vocab; + const auto & hparams = model.hparams; + + const int32_t n_vocab = vocab.n_tokens(); + const int64_t n_embd = hparams.n_embd; + + // TODO: try catch + auto bman = prepare_batch(batch); + + const auto n_outputs_all = bman->n_outputs_all; + + // reserve output buffer + // TODO: move to batch manager? + if (llama_output_reserve(*this, bman->n_outputs_all) < (size_t) n_outputs_all) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); + return -2; + }; + + int64_t n_outputs_prev = 0; + + while (!bman->is_done()) { + llama_ubatch ubatch = bman->next(); + + if (!bman->prepare()) { + LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); + bman->restore(); + return -3; + } + + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + ggml_cgraph * gf = cb_build_graph(*this, ubatch, false); + + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + set_inputs(ubatch); + + // the output is always the last tensor in the graph + struct ggml_tensor * t_logits = ggml_graph_node(gf, -1); + struct ggml_tensor * t_embd = ggml_graph_node(gf, -2); + + if (n_outputs == 0) { + // no output + t_logits = nullptr; + t_embd = nullptr; + } else if (cparams.embeddings) { + t_logits = nullptr; // do not extract logits for embedding case + t_embd = nullptr; + for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { + if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) { + t_embd = ggml_graph_node(gf, i); + break; + } + } + GGML_ASSERT(t_embd != nullptr && "missing embeddings tensor"); + } else { + t_embd = nullptr; // do not extract embeddings when not needed + GGML_ASSERT(strcmp(t_logits->name, "result_output") == 0 && "missing result_output tensor"); + } + + const auto compute_status = compute_graph(gf, ubatch.n_tokens > 1); + if (compute_status != GGML_STATUS_SUCCESS) { + bman->restore(); + switch (compute_status) { + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } + } + + bman->update(); + + // plot the computation graph in dot format (for debugging purposes) + //if (n_past%100 == 0) { + // ggml_graph_dump_dot(gf, NULL, "llama.dot"); + //} + + // extract logits + if (t_logits) { + ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits); + GGML_ASSERT(backend_res != nullptr); + GGML_ASSERT(logits != nullptr); + + float * logits_out = logits + n_outputs_prev*n_vocab; + const int32_t n_outputs_new = n_outputs; + + if (n_outputs_new) { + GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs_all); + GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) logits_size); + ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs_new*n_vocab*sizeof(float)); + } + } + + // extract embeddings + if (t_embd) { + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); + GGML_ASSERT(backend_embd != nullptr); + + switch (cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + // extract token embeddings + GGML_ASSERT(embd != nullptr); + float * embd_out = embd + n_outputs_prev*n_embd; + const int32_t n_outputs_new = n_outputs; + 
+ if (n_outputs_new) { + GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs_all); + GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings (cleared before processing each batch) + auto & embd_seq_out = embd_seq; + + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_RANK: + { + // extract the rerank score - a single float per sequence + auto & embd_seq_out = embd_seq; + + for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(1); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); + } + } + } + + n_outputs_prev += n_outputs; + } + + // set output mappings + { + bool sorted_output = true; + + GGML_ASSERT(sbatch.out_ids.size() == (size_t) n_outputs_all); + + for (size_t i = 0; i < (size_t) n_outputs_all; ++i) { + size_t out_id = sbatch.out_ids[i]; + output_ids[out_id] = i; + if (out_id != i) { + sorted_output = false; + } + } + + if (sorted_output) { + sbatch.out_ids.clear(); + } + } + + // set to total number of outputs in the batch, for use in llama_get_logits_ith + n_outputs = n_outputs_all; + + // wait for the computation to finish (automatically done when obtaining the model output) + //llama_synchronize(&; + + bman->finalize(); + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. + ggml_backend_sched_reset(sched.get()); + + return 0; +} + +int llama_context::encode(llama_batch & inp_batch) { + is_encoding = true; + + if (inp_batch.n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); + return -1; + } + + // temporary allocate memory for the input batch if needed + // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : pos_max() + 1); + + const llama_batch & batch = batch_allocr.batch; + const uint32_t n_tokens = batch.n_tokens; + + const auto & hparams = model.hparams; + + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + + if (batch.token) { + for (uint32_t i = 0; i < n_tokens; ++i) { + if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); + return -1; + } + } + } + + // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot + GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens"); + + if (t_compute_start_us == 0) { + t_compute_start_us = ggml_time_us(); + } + + n_queued_tokens += n_tokens; + + const int64_t n_embd = hparams.n_embd; + + sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); + + const llama_ubatch ubatch = sbatch.split_simple(n_tokens); + + // reserve output buffer + if (llama_output_reserve(*this, n_tokens) < n_tokens) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); + return -2; + }; + + for (uint32_t i = 0; i < n_tokens; ++i) { + output_ids[i] = i; + } + + inp_embd_enc = NULL; + n_outputs = n_tokens; + + //batch_manager->prepare(ubatch); + + // TODO: do reserve + GGML_ASSERT(need_reserve == false); + + ggml_backend_sched_reset(sched.get()); + ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + + ggml_cgraph * gf = cb_build_graph(*this, ubatch, false); + + ggml_backend_sched_alloc_graph(sched.get(), gf); + + set_inputs(ubatch); + + // the output embeddings after the final encoder normalization + struct ggml_tensor * t_embd = nullptr; + + // there are two cases here + if (llama_model_has_decoder(&model)) { + // first case is an encoder-decoder T5 model where embeddings are passed to decoder + t_embd = ggml_graph_node(gf, -1); + GGML_ASSERT(strcmp(t_embd->name, "result_norm") == 0 && "missing result_output tensor"); + } else { + // second case is an encoder-only T5 model + if (cparams.embeddings) { + // only output embeddings if required + t_embd = ggml_graph_node(gf, -1); + if (strcmp(t_embd->name, "result_embd_pooled") != 0) { + t_embd = ggml_graph_node(gf, -2); + } + GGML_ASSERT(strcmp(t_embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); + } + } + + const auto compute_status = compute_graph(gf, n_tokens > 1); + switch (compute_status) { + case GGML_STATUS_SUCCESS: + break; + case GGML_STATUS_ABORTED: + return 2; + case GGML_STATUS_ALLOC_FAILED: + return -2; + case GGML_STATUS_FAILED: + default: + return -3; + } + + // extract embeddings + if (t_embd) { + ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd); + GGML_ASSERT(backend_embd != nullptr); + + if (llama_model_has_decoder(&model)) { + embd_enc.resize(n_tokens*n_embd); + float * embd_out = embd_enc.data(); + + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits + + // remember the sequence ids used during the encoding - needed for cross attention later + seq_ids_enc.resize(n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + for (int s = 0; s < ubatch.n_seq_id[i]; s++) { + llama_seq_id seq_id = ubatch.seq_id[i][s]; + seq_ids_enc[i].insert(seq_id); + } + } + } else { + GGML_ASSERT(embd != nullptr); + + switch 
(cparams.pooling_type) { + case LLAMA_POOLING_TYPE_NONE: + { + // extract token embeddings + GGML_ASSERT(embd != nullptr); + float * embd_out = embd; + + GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); + } break; + case LLAMA_POOLING_TYPE_MEAN: + case LLAMA_POOLING_TYPE_CLS: + case LLAMA_POOLING_TYPE_LAST: + { + // extract sequence embeddings + auto & embd_seq_out = embd_seq; + embd_seq_out.clear(); + + GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits + + for (uint32_t i = 0; i < n_tokens; i++) { + const llama_seq_id seq_id = ubatch.seq_id[i][0]; + if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { + continue; + } + embd_seq_out[seq_id].resize(n_embd); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); + } + } break; + case LLAMA_POOLING_TYPE_RANK: + { + // TODO: this likely should be the same logic as in llama_decoder_internal, but better to + // wait for an encoder model that requires this pooling type in order to test it + // https://github.com/ggerganov/llama.cpp/pull/9510 + GGML_ABORT("RANK pooling not implemented yet"); + } + case LLAMA_POOLING_TYPE_UNSPECIFIED: + { + GGML_ABORT("unknown pooling type"); + } + } + } + } + + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. + ggml_backend_sched_reset(sched.get()); + + return 0; +} + enum ggml_status llama_context::compute_graph( ggml_cgraph * graph, bool batched) { @@ -2194,119 +2711,6 @@ ggml_tensor * llama_context::build_rwkv6_time_mix( return cur; } -// llama output - -size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { - const auto & cparams = lctx.cparams; - const auto & hparams = lctx.model.hparams; - const auto & vocab = lctx.model.vocab; - - const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); - - const auto n_batch = cparams.n_batch; - const auto n_vocab = vocab.n_tokens(); - const auto n_embd = hparams.n_embd; - - // TODO: use a per-batch flag for logits presence instead - const bool has_logits = !cparams.embeddings; - const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); - - const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0; - const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0; - - if (lctx.output_ids.empty()) { - // init, never resized afterwards - lctx.output_ids.resize(n_batch); - } - - const size_t prev_size = lctx.buf_output ? 
ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0; - const size_t new_size = (logits_size + embd_size) * sizeof(float); - - // alloc only when more than the current capacity is required - // TODO: also consider shrinking the buffer - if (!lctx.buf_output || prev_size < new_size) { - if (lctx.buf_output) { -#ifndef NDEBUG - // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) - LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); -#endif - lctx.buf_output = nullptr; - lctx.logits = nullptr; - lctx.embd = nullptr; - } - - auto * buft = ggml_backend_cpu_buffer_type(); - // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory - auto * output_dev = lctx.model.dev_output(); - auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr; - if (output_dev_host_buft) { - buft = output_dev_host_buft; - } - lctx.buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); - if (lctx.buf_output == nullptr) { - LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); - return 0; - } - } - - float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output.get()); - - lctx.logits = has_logits ? output_base : nullptr; - lctx.embd = has_embd ? output_base + logits_size : nullptr; - - lctx.output_size = n_outputs_max; - lctx.logits_size = logits_size; - lctx.embd_size = embd_size; - - // set all ids as invalid (negative) - std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1); - - ggml_backend_buffer_clear(lctx.buf_output.get(), 0); - - lctx.n_outputs = 0; - - return n_outputs_max; -} - -void llama_output_reorder(struct llama_context & ctx) { - std::vector & out_ids = ctx.sbatch.out_ids; - if (!out_ids.empty()) { - const uint32_t n_vocab = ctx.model.vocab.n_tokens(); - const uint32_t n_embd = ctx.model.hparams.n_embd; - - const int32_t n_outputs = ctx.n_outputs; - GGML_ASSERT((size_t) n_outputs == out_ids.size()); - - // TODO: is there something more efficient which also minimizes swaps? 
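[Editor's aside - not part of the patch] The selection sort below keeps out_ids and the corresponding logits/embd rows in sync, then rebuilds output_ids so a batch position can be mapped back to an output row. A minimal standalone sketch of that contract, assuming toy values and a single float per row in place of full logit/embedding rows:

    #include <cstdio>
    #include <cstdint>
    #include <utility>
    #include <vector>

    int main() {
        // out_ids[i] = batch position that produced output row i (toy values)
        std::vector<size_t> out_ids = {2, 0, 3, 1};
        std::vector<float>  rows    = {20.f, 0.f, 30.f, 10.f}; // one value per row for brevity

        // selection sort by batch position, swapping the data rows alongside the ids
        for (size_t i = 0; i + 1 < out_ids.size(); ++i) {
            size_t j_min = i;
            for (size_t j = i + 1; j < out_ids.size(); ++j) {
                if (out_ids[j] < out_ids[j_min]) {
                    j_min = j;
                }
            }
            if (j_min != i) {
                std::swap(out_ids[i], out_ids[j_min]);
                std::swap(rows[i], rows[j_min]);
            }
        }

        // rebuild the batch-position -> row mapping
        std::vector<int32_t> output_ids(4, -1);
        for (int32_t i = 0; i < (int32_t) out_ids.size(); ++i) {
            output_ids[out_ids[i]] = i;
        }

        // rows is now {0, 10, 20, 30}, i.e. batch order
        for (size_t i = 0; i < rows.size(); ++i) {
            std::printf("row %zu: out_id=%zu value=%.0f\n", i, out_ids[i], rows[i]);
        }
        for (size_t i = 0; i < output_ids.size(); ++i) {
            std::printf("batch pos %zu -> row %d\n", i, output_ids[i]);
        }
        return 0;
    }

Selection sort is used here because it minimizes row swaps, and each real swap moves n_vocab or n_embd floats.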
- // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) - for (int32_t i = 0; i < n_outputs - 1; ++i) { - int32_t j_min = i; - for (int32_t j = i + 1; j < n_outputs; ++j) { - if (out_ids[j] < out_ids[j_min]) { - j_min = j; - } - } - if (j_min == i) { continue; } - std::swap(out_ids[i], out_ids[j_min]); - if (ctx.logits_size > 0) { - for (uint32_t k = 0; k < n_vocab; k++) { - std::swap(ctx.logits[i*n_vocab + k], ctx.logits[j_min*n_vocab + k]); - } - } - if (ctx.embd_size > 0) { - for (uint32_t k = 0; k < n_embd; k++) { - std::swap(ctx.embd[i*n_embd + k], ctx.embd[j_min*n_embd + k]); - } - } - } - std::fill(ctx.output_ids.begin(), ctx.output_ids.end(), -1); - for (int32_t i = 0; i < n_outputs; ++i) { - ctx.output_ids[out_ids[i]] = i; - } - out_ids.clear(); - } -} - // // interface implementation // diff --git a/src/llama-context.h b/src/llama-context.h index 4cf4a6312..f6d63eb3c 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -16,22 +16,7 @@ using llama_loras = std::unordered_map; -// TODO: this is very WIP - improve -struct llama_batch_manager_i { - virtual ~llama_batch_manager_i() = default; - - //bool is_done() const; - - virtual llama_ubatch next() = 0; - - virtual bool prepare() = 0; - virtual void restore() = 0; - virtual void update() = 0; - virtual void finalize() = 0; - - // TODO: might be temporary - int64_t n_outputs_all = 0; -}; +struct llama_batch_manager_i; // TODO: make implementation details private // TODO: become abstract base class, split the current implementation into different child classes @@ -44,6 +29,8 @@ struct llama_context { const llama_context_params & params, build_graph_callback && cb_build_graph); + virtual ~llama_context() = default; + const struct llama_model & model; llama_cparams cparams; @@ -104,8 +91,10 @@ struct llama_context { ggml_abort_callback abort_callback = nullptr; void * abort_callback_data = nullptr; - // TODO: do not pass logits_all explicitly - std::unique_ptr prepare_batch(const llama_batch & batch); + virtual std::unique_ptr prepare_batch(const llama_batch & batch); + + virtual int decode(llama_batch & inp_batch); + virtual int encode(llama_batch & inp_batch); // returns the result of ggml_backend_sched_graph_compute_async execution enum ggml_status compute_graph( @@ -286,13 +275,6 @@ struct llama_context { int n_pos_per_token = 1; }; -// Make sure enough space is available for outputs. -// Returns max number of outputs for which space was reserved. -size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs); - -// make the outputs have the same order they had in the user-provided batch -void llama_output_reorder(struct llama_context & ctx); - // For internal test use // TODO: remove const std::vector> & llama_internal_get_tensor_map(struct llama_context * ctx); diff --git a/src/llama.cpp b/src/llama.cpp index e89e70bbe..ed5e1e525 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7584,213 +7584,7 @@ static struct ggml_cgraph * llama_build_graph( static int llama_decode_impl( llama_context & lctx, llama_batch inp_batch) { - - lctx.is_encoding = false; - - if (inp_batch.n_tokens == 0) { - LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); - return -1; - } - - // temporary allocate memory for the input batch if needed - // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : lctx.pos_max() + 1); - - const llama_batch & batch = batch_allocr.batch; - - const auto & model = lctx.model; - const auto & vocab = model.vocab; - const auto & cparams = lctx.cparams; - const auto & hparams = lctx.model.hparams; - - const int32_t n_vocab = vocab.n_tokens(); - const int64_t n_embd = hparams.n_embd; - - // TODO: try catch - auto bman = lctx.prepare_batch(batch); - - const auto n_outputs_all = bman->n_outputs_all; - - // reserve output buffer - // TODO: move to batch manager? - if (llama_output_reserve(lctx, bman->n_outputs_all) < (size_t) n_outputs_all) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); - return -2; - }; - - int64_t n_outputs_prev = 0; - - while (lctx.sbatch.n_tokens > 0) { - llama_ubatch ubatch = bman->next(); - - if (!bman->prepare()) { - LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); - bman->restore(); - return -3; - } - - ggml_backend_sched_reset(lctx.sched.get()); - ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); - - ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); - - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); - - ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - - lctx.set_inputs(ubatch); - - // the output is always the last tensor in the graph - struct ggml_tensor * res = ggml_graph_node(gf, -1); - struct ggml_tensor * embd = ggml_graph_node(gf, -2); - - if (lctx.n_outputs == 0) { - // no output - res = nullptr; - embd = nullptr; - } else if (cparams.embeddings) { - res = nullptr; // do not extract logits for embedding case - embd = nullptr; - for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { - if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) { - embd = ggml_graph_node(gf, i); - break; - } - } - GGML_ASSERT(embd != nullptr && "missing embeddings tensor"); - } else { - embd = nullptr; // do not extract embeddings when not needed - GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); - } - - const auto compute_status = lctx.compute_graph(gf, ubatch.n_tokens > 1); - if (compute_status != GGML_STATUS_SUCCESS) { - bman->restore(); - switch (compute_status) { - case GGML_STATUS_ABORTED: - return 2; - case GGML_STATUS_ALLOC_FAILED: - return -2; - case GGML_STATUS_FAILED: - default: - return -3; - } - } - - bman->update(); - - // plot the computation graph in dot format (for debugging purposes) - //if (n_past%100 == 0) { - // ggml_graph_dump_dot(gf, NULL, "llama.dot"); - //} - - // extract logits - if (res) { - ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), res); - GGML_ASSERT(backend_res != nullptr); - GGML_ASSERT(lctx.logits != nullptr); - - float * logits_out = lctx.logits + n_outputs_prev*n_vocab; - const int32_t n_outputs_new = lctx.n_outputs; - - if (n_outputs_new) { - GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs_all); - GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size); - ggml_backend_tensor_get_async(backend_res, res, logits_out, 0, n_outputs_new*n_vocab*sizeof(float)); - } - } - - // extract embeddings - if (embd) { - ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), embd); - GGML_ASSERT(backend_embd != nullptr); - - switch (cparams.pooling_type) { - case LLAMA_POOLING_TYPE_NONE: - { - // extract token embeddings - 
GGML_ASSERT(lctx.embd != nullptr); - float * embd_out = lctx.embd + n_outputs_prev*n_embd; - const int32_t n_outputs_new = lctx.n_outputs; - - if (n_outputs_new) { - GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs_all); - GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size); - ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_MEAN: - case LLAMA_POOLING_TYPE_CLS: - case LLAMA_POOLING_TYPE_LAST: - { - // extract sequence embeddings (cleared before processing each batch) - auto & embd_seq_out = lctx.embd_seq; - - for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { - continue; - } - embd_seq_out[seq_id].resize(n_embd); - ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_RANK: - { - // extract the rerank score - a single float per sequence - auto & embd_seq_out = lctx.embd_seq; - - for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { - continue; - } - embd_seq_out[seq_id].resize(1); - ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_UNSPECIFIED: - { - GGML_ABORT("unknown pooling type"); - } - } - } - - n_outputs_prev += lctx.n_outputs; - } - - // set output mappings - { - bool sorted_output = true; - - GGML_ASSERT(lctx.sbatch.out_ids.size() == (size_t) n_outputs_all); - - for (size_t i = 0; i < (size_t) n_outputs_all; ++i) { - size_t out_id = lctx.sbatch.out_ids[i]; - lctx.output_ids[out_id] = i; - if (out_id != i) { - sorted_output = false; - } - } - - if (sorted_output) { - lctx.sbatch.out_ids.clear(); - } - } - - // set to total number of outputs in the batch, for use in llama_get_logits_ith - lctx.n_outputs = n_outputs_all; - - // wait for the computation to finish (automatically done when obtaining the model output) - //llama_synchronize(&lctx); - - bman->finalize(); - - // Reset state for the next token before backend sync, to allow the CPU activities in the reset to - // overlap with device computation. - ggml_backend_sched_reset(lctx.sched.get()); - - return 0; + return lctx.decode(inp_batch); } // encode a batch of tokens by evaluating the encoder part of the transformer @@ -7805,183 +7599,7 @@ static int llama_decode_impl( static int llama_encode_impl( llama_context & lctx, llama_batch inp_batch) { - - lctx.is_encoding = true; - - if (inp_batch.n_tokens == 0) { - LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); - return -1; - } - - // temporary allocate memory for the input batch if needed - // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : lctx.pos_max() + 1); - - const llama_batch & batch = batch_allocr.batch; - const uint32_t n_tokens = batch.n_tokens; - - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; - - GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT - - if (batch.token) { - for (uint32_t i = 0; i < n_tokens; ++i) { - if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) { - LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); - return -1; - } - } - } - - // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot - GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens"); - - if (lctx.t_compute_start_us == 0) { - lctx.t_compute_start_us = ggml_time_us(); - } - - lctx.n_queued_tokens += n_tokens; - - const int64_t n_embd = hparams.n_embd; - - lctx.sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); - - const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens); - - // reserve output buffer - if (llama_output_reserve(lctx, n_tokens) < n_tokens) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); - return -2; - }; - - for (uint32_t i = 0; i < n_tokens; ++i) { - lctx.output_ids[i] = i; - } - - lctx.inp_embd_enc = NULL; - lctx.n_outputs = n_tokens; - - //batch_manager->prepare(ubatch); - - // TODO: do reserve - GGML_ASSERT(lctx.need_reserve == false); - - ggml_backend_sched_reset(lctx.sched.get()); - ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); - - ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); - - ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); - - lctx.set_inputs(ubatch); - - // the output embeddings after the final encoder normalization - struct ggml_tensor * embd = nullptr; - - // there are two cases here - if (llama_model_has_decoder(&lctx.model)) { - // first case is an encoder-decoder T5 model where embeddings are passed to decoder - embd = ggml_graph_node(gf, -1); - GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor"); - } else { - // second case is an encoder-only T5 model - if (cparams.embeddings) { - // only output embeddings if required - embd = ggml_graph_node(gf, -1); - if (strcmp(embd->name, "result_embd_pooled") != 0) { - embd = ggml_graph_node(gf, -2); - } - GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); - } - } - - const auto compute_status = lctx.compute_graph(gf, n_tokens > 1); - switch (compute_status) { - case GGML_STATUS_SUCCESS: - break; - case GGML_STATUS_ABORTED: - return 2; - case GGML_STATUS_ALLOC_FAILED: - return -2; - case GGML_STATUS_FAILED: - default: - return -3; - } - - // extract embeddings - if (embd) { - ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), embd); - GGML_ASSERT(backend_embd != nullptr); - - if (llama_model_has_decoder(&lctx.model)) { - lctx.embd_enc.resize(n_tokens*n_embd); - float * embd_out = lctx.embd_enc.data(); - - ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); - GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - - // remember the sequence ids used during the encoding - needed for cross attention later - lctx.seq_ids_enc.resize(n_tokens); - for (uint32_t i = 0; i < n_tokens; i++) { - for (int s = 0; s < 
ubatch.n_seq_id[i]; s++) { - llama_seq_id seq_id = ubatch.seq_id[i][s]; - lctx.seq_ids_enc[i].insert(seq_id); - } - } - } else { - GGML_ASSERT(lctx.embd != nullptr); - - switch (cparams.pooling_type) { - case LLAMA_POOLING_TYPE_NONE: - { - // extract token embeddings - GGML_ASSERT(lctx.embd != nullptr); - float * embd_out = lctx.embd; - - GGML_ASSERT(n_tokens*n_embd <= (int64_t) lctx.embd_size); - ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float)); - } break; - case LLAMA_POOLING_TYPE_MEAN: - case LLAMA_POOLING_TYPE_CLS: - case LLAMA_POOLING_TYPE_LAST: - { - // extract sequence embeddings - auto & embd_seq_out = lctx.embd_seq; - embd_seq_out.clear(); - - GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits - - for (uint32_t i = 0; i < n_tokens; i++) { - const llama_seq_id seq_id = ubatch.seq_id[i][0]; - if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { - continue; - } - embd_seq_out[seq_id].resize(n_embd); - ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float)); - } - } break; - case LLAMA_POOLING_TYPE_RANK: - { - // TODO: this likely should be the same logic as in llama_decoder_internal, but better to - // wait for an encoder model that requires this pooling type in order to test it - // https://github.com/ggerganov/llama.cpp/pull/9510 - GGML_ABORT("RANK pooling not implemented yet"); - } - case LLAMA_POOLING_TYPE_UNSPECIFIED: - { - GGML_ABORT("unknown pooling type"); - } - } - } - } - - // Reset state for the next token before backend sync, to allow the CPU activities in the reset to - // overlap with device computation. - ggml_backend_sched_reset(lctx.sched.get()); - - return 0; + return lctx.encode(inp_batch); } // From d1d8d530083a9bf3ada2427bf59e97fa58667365 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 10 Feb 2025 16:50:14 +0200 Subject: [PATCH 27/28] bman : remove ubatch member ggml-ci --- src/llama-context.cpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 5d21dd5ef..4387128fe 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -460,9 +460,9 @@ struct llama_batch_manager_i { virtual bool is_done() const = 0; virtual llama_ubatch next() = 0; - virtual bool prepare() = 0; + virtual bool prepare(const llama_ubatch & ubatch) = 0; virtual void restore() = 0; - virtual void update() = 0; + virtual void update(const llama_ubatch & ubatch) = 0; virtual void finalize() = 0; // TODO: might be temporary @@ -532,7 +532,7 @@ struct llama_batch_manager : public llama_batch_manager_i { } virtual llama_ubatch next() override { - ubatch = llama_ubatch(); + llama_ubatch ubatch = llama_ubatch(); const auto & cparams = lctx.cparams; const auto & kv_self = lctx.kv_self; @@ -557,7 +557,7 @@ struct llama_batch_manager : public llama_batch_manager_i { return ubatch; } - virtual bool prepare() override { + virtual bool prepare(const llama_ubatch & ubatch) override { const auto & cparams = lctx.cparams; const auto & hparams = lctx.model.hparams; const auto & batch = lctx.sbatch.batch; @@ -644,7 +644,7 @@ struct llama_batch_manager : public llama_batch_manager_i { kv_slot_restorer.restore(lctx.kv_self); } - virtual void update() override { + virtual void update(const llama_ubatch & ubatch) override { auto & kv_self = lctx.kv_self; // update the kv ring buffer @@ -682,8 +682,6 @@ struct llama_batch_manager : public llama_batch_manager_i { const llama_batch & 
batch; - llama_ubatch ubatch; - llama_kv_slot_restorer kv_slot_restorer; }; @@ -728,7 +726,7 @@ int llama_context::decode(llama_batch & inp_batch) { while (!bman->is_done()) { llama_ubatch ubatch = bman->next(); - if (!bman->prepare()) { + if (!bman->prepare(ubatch)) { LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); bman->restore(); return -3; @@ -782,7 +780,7 @@ int llama_context::decode(llama_batch & inp_batch) { } } - bman->update(); + bman->update(ubatch); // plot the computation graph in dot format (for debugging purposes) //if (n_past%100 == 0) { From 2cd8a903c84b9fbf91f256a6349e05e492a47421 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 10 Feb 2025 17:01:27 +0200 Subject: [PATCH 28/28] context : make output functions members ggml-ci --- src/llama-context.cpp | 238 ++++++++++++++++++++---------------------- src/llama-context.h | 8 ++ 2 files changed, 122 insertions(+), 124 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 4387128fe..87d6642da 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -9,121 +9,6 @@ #include #include -// llama output (TMP) - -// Make sure enough space is available for outputs. -// Returns max number of outputs for which space was reserved. -static size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) { - const auto & cparams = lctx.cparams; - const auto & hparams = lctx.model.hparams; - const auto & vocab = lctx.model.vocab; - - const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); - - const auto n_batch = cparams.n_batch; - const auto n_vocab = vocab.n_tokens(); - const auto n_embd = hparams.n_embd; - - // TODO: use a per-batch flag for logits presence instead - const bool has_logits = !cparams.embeddings; - const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); - - const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0; - const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0; - - if (lctx.output_ids.empty()) { - // init, never resized afterwards - lctx.output_ids.resize(n_batch); - } - - const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0; - const size_t new_size = (logits_size + embd_size) * sizeof(float); - - // alloc only when more than the current capacity is required - // TODO: also consider shrinking the buffer - if (!lctx.buf_output || prev_size < new_size) { - if (lctx.buf_output) { -#ifndef NDEBUG - // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) - LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); -#endif - lctx.buf_output = nullptr; - lctx.logits = nullptr; - lctx.embd = nullptr; - } - - auto * buft = ggml_backend_cpu_buffer_type(); - // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory - auto * output_dev = lctx.model.dev_output(); - auto * output_dev_host_buft = output_dev ? 
ggml_backend_dev_host_buffer_type(output_dev) : nullptr; - if (output_dev_host_buft) { - buft = output_dev_host_buft; - } - lctx.buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); - if (lctx.buf_output == nullptr) { - LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); - return 0; - } - } - - float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output.get()); - - lctx.logits = has_logits ? output_base : nullptr; - lctx.embd = has_embd ? output_base + logits_size : nullptr; - - lctx.output_size = n_outputs_max; - lctx.logits_size = logits_size; - lctx.embd_size = embd_size; - - // set all ids as invalid (negative) - std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1); - - ggml_backend_buffer_clear(lctx.buf_output.get(), 0); - - lctx.n_outputs = 0; - - return n_outputs_max; -} - -// make the outputs have the same order they had in the user-provided batch -static void llama_output_reorder(struct llama_context & ctx) { - std::vector & out_ids = ctx.sbatch.out_ids; - if (!out_ids.empty()) { - const uint32_t n_vocab = ctx.model.vocab.n_tokens(); - const uint32_t n_embd = ctx.model.hparams.n_embd; - - const int32_t n_outputs = ctx.n_outputs; - GGML_ASSERT((size_t) n_outputs == out_ids.size()); - - // TODO: is there something more efficient which also minimizes swaps? - // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) - for (int32_t i = 0; i < n_outputs - 1; ++i) { - int32_t j_min = i; - for (int32_t j = i + 1; j < n_outputs; ++j) { - if (out_ids[j] < out_ids[j_min]) { - j_min = j; - } - } - if (j_min == i) { continue; } - std::swap(out_ids[i], out_ids[j_min]); - if (ctx.logits_size > 0) { - for (uint32_t k = 0; k < n_vocab; k++) { - std::swap(ctx.logits[i*n_vocab + k], ctx.logits[j_min*n_vocab + k]); - } - } - if (ctx.embd_size > 0) { - for (uint32_t k = 0; k < n_embd; k++) { - std::swap(ctx.embd[i*n_embd + k], ctx.embd[j_min*n_embd + k]); - } - } - } - std::fill(ctx.output_ids.begin(), ctx.output_ids.end(), -1); - for (int32_t i = 0; i < n_outputs; ++i) { - ctx.output_ids[out_ids[i]] = i; - } - out_ids.clear(); - } -} static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) { // TODO move to hparams if a T5 variant appears that uses a different value const int64_t max_distance = 128; @@ -334,7 +219,7 @@ llama_context::llama_context( // graph outputs buffer { // resized during inference when a batch uses more outputs - if (llama_output_reserve(*this, params.n_seq_max) < params.n_seq_max) { + if (reserve_outputs(params.n_seq_max) < params.n_seq_max) { LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); throw std::runtime_error("failed to reserve initial output buffer"); } @@ -716,7 +601,7 @@ int llama_context::decode(llama_batch & inp_batch) { // reserve output buffer // TODO: move to batch manager? 
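[Editor's aside - not part of the patch] The hunk below only renames the call (llama_output_reserve becomes the reserve_outputs member), but the layout it relies on is worth spelling out: logits and per-token embeddings share one flat host buffer, logits first, embeddings immediately after. A minimal sketch under assumed sizes, with a plain std::vector standing in for the ggml backend buffer:

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main() {
        // hypothetical sizes, for illustration only
        const size_t n_vocab       = 32000;
        const size_t n_embd        = 4096;
        const size_t n_outputs_max = 8;

        const bool embeddings   = false;  // cparams.embeddings
        const bool pooling_none = true;   // cparams.pooling_type == LLAMA_POOLING_TYPE_NONE

        // same sizing rule as reserve_outputs(): logits when not in embeddings mode,
        // per-token embeddings only for non-pooled embedding extraction
        const size_t logits_size = !embeddings                  ? n_vocab * n_outputs_max : 0;
        const size_t embd_size   = (embeddings && pooling_none) ? n_embd  * n_outputs_max : 0;

        // one flat float buffer holds both views: logits first, embeddings right after
        std::vector<float> buf_output(logits_size + embd_size);

        float * logits = logits_size ? buf_output.data()               : nullptr;
        float * embd   = embd_size   ? buf_output.data() + logits_size : nullptr;

        std::printf("output buffer: %.2f MiB (logits %zu floats, embd %zu floats)\n",
                    buf_output.size() * sizeof(float) / (1024.0 * 1024.0), logits_size, embd_size);
        (void) logits; (void) embd;
        return 0;
    }

In the patch the buffer is only reallocated when the required size grows, and a host buffer of the output device is preferred so the device-to-host copies of logits and embeddings are faster.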
- if (llama_output_reserve(*this, bman->n_outputs_all) < (size_t) n_outputs_all) { + if (reserve_outputs(bman->n_outputs_all) < (size_t) n_outputs_all) { LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); return -2; }; @@ -940,7 +825,7 @@ int llama_context::encode(llama_batch & inp_batch) { const llama_ubatch ubatch = sbatch.split_simple(n_tokens); // reserve output buffer - if (llama_output_reserve(*this, n_tokens) < n_tokens) { + if (reserve_outputs(n_tokens) < n_tokens) { LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); return -2; }; @@ -1555,6 +1440,113 @@ void llama_context::set_inputs(const llama_ubatch & ubatch) { } } +void llama_context::reorder_outputs() { + std::vector & out_ids = sbatch.out_ids; + if (!out_ids.empty()) { + const uint32_t n_vocab = model.vocab.n_tokens(); + const uint32_t n_embd = model.hparams.n_embd; + + GGML_ASSERT((size_t) n_outputs == out_ids.size()); + + // TODO: is there something more efficient which also minimizes swaps? + // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) + for (int32_t i = 0; i < n_outputs - 1; ++i) { + int32_t j_min = i; + for (int32_t j = i + 1; j < n_outputs; ++j) { + if (out_ids[j] < out_ids[j_min]) { + j_min = j; + } + } + if (j_min == i) { continue; } + std::swap(out_ids[i], out_ids[j_min]); + if (logits_size > 0) { + for (uint32_t k = 0; k < n_vocab; k++) { + std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]); + } + } + if (embd_size > 0) { + for (uint32_t k = 0; k < n_embd; k++) { + std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]); + } + } + } + std::fill(output_ids.begin(), output_ids.end(), -1); + for (int32_t i = 0; i < n_outputs; ++i) { + output_ids[out_ids[i]] = i; + } + out_ids.clear(); + } +} + +size_t llama_context::reserve_outputs(size_t n_outputs) { + const auto & hparams = model.hparams; + const auto & vocab = model.vocab; + + const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); + + const auto n_batch = cparams.n_batch; + const auto n_vocab = vocab.n_tokens(); + const auto n_embd = hparams.n_embd; + + // TODO: use a per-batch flag for logits presence instead + const bool has_logits = !cparams.embeddings; + const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); + + logits_size = has_logits ? n_vocab*n_outputs_max : 0; + embd_size = has_embd ? n_embd*n_outputs_max : 0; + + if (output_ids.empty()) { + // init, never resized afterwards + output_ids.resize(n_batch); + } + + const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0; + const size_t new_size = (logits_size + embd_size) * sizeof(float); + + // alloc only when more than the current capacity is required + // TODO: also consider shrinking the buffer + if (!buf_output || prev_size < new_size) { + if (buf_output) { +#ifndef NDEBUG + // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) + LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); +#endif + buf_output = nullptr; + logits = nullptr; + embd = nullptr; + } + + auto * buft = ggml_backend_cpu_buffer_type(); + // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory + auto * output_dev = model.dev_output(); + auto * output_dev_host_buft = output_dev ? 
ggml_backend_dev_host_buffer_type(output_dev) : nullptr; + if (output_dev_host_buft) { + buft = output_dev_host_buft; + } + buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size)); + if (buf_output == nullptr) { + LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); + return 0; + } + } + + float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get()); + + logits = has_logits ? output_base : nullptr; + embd = has_embd ? output_base + logits_size : nullptr; + + output_size = n_outputs_max; + + // set all ids as invalid (negative) + std::fill(output_ids.begin(), output_ids.end(), -1); + + ggml_backend_buffer_clear(buf_output.get(), 0); + + n_outputs = 0; + + return n_outputs_max; +} + // do mat_mul, while optionally apply lora ggml_tensor * llama_context::build_lora_mm( ggml_context * ctx0, @@ -2827,8 +2819,7 @@ float * llama_get_logits(struct llama_context * ctx) { llama_synchronize(ctx); // reorder logits for backward compatibility - // TODO: maybe deprecate this - llama_output_reorder(*ctx); + ctx->reorder_outputs(); return ctx->logits; } @@ -2877,8 +2868,7 @@ float * llama_get_embeddings(struct llama_context * ctx) { llama_synchronize(ctx); // reorder embeddings for backward compatibility - // TODO: maybe deprecate this - llama_output_reorder(*ctx); + ctx->reorder_outputs(); return ctx->embd; } @@ -3187,7 +3177,7 @@ struct llama_data_write { //} void write_output_ids(struct llama_context * ctx) { - llama_output_reorder(*ctx); + ctx->reorder_outputs(); const uint32_t n_outputs = ctx->n_outputs; @@ -3281,7 +3271,7 @@ struct llama_data_read { uint32_t n_outputs; read_to(&n_outputs, sizeof(n_outputs)); - if (n_outputs > llama_output_reserve(*ctx, n_outputs)) { + if (n_outputs > ctx->reserve_outputs(n_outputs)) { throw std::runtime_error("could not reserve outputs"); } diff --git a/src/llama-context.h b/src/llama-context.h index f6d63eb3c..8f22fd3b1 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -114,6 +114,14 @@ struct llama_context { void set_inputs(const llama_ubatch & ubatch); + // make the outputs have the same order they had in the user-provided batch + // TODO: maybe deprecate this + void reorder_outputs(); + + // Make sure enough space is available for outputs. + // Returns max number of outputs for which space was reserved. + size_t reserve_outputs(size_t n_outputs); + ggml_tensor * build_lora_mm( ggml_context * ctx0, ggml_tensor * w,