From 7824722c8cef13c807ec2cd09c9f3b101405cbb3 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 27 Feb 2024 14:35:27 +0200
Subject: [PATCH] llama : fix graph size check during defrag

---
 llama.cpp | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 8c2fc9fde..0290899f6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8150,15 +8150,12 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
             nh++;
         }
 
-        // in the worst case each move requires 6*n_layer tensors
+        // each move requires 6*n_layer tensors (see build_defrag)
+        //   - source view, destination view, copy operation
+        //   - x2 for keys and values
         //
-        // TODO: ideally this should be:
-        //
-        //         if (6*(n_moves + nh)*n_layer > LLAMA_MAX_NODES) {
-        //
-        //       but when I do that, the defrag graph can not fit due to not enough memory - not sure why
-        //
-        if (6*(n_moves + nh)*n_layer > LLAMA_MAX_NODES/2) {
+        if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
+            // the defrag graph would reach LLAMA_MAX_NODES - stop here and do not schedule more cell moves
             break;
         }
 
@@ -12044,7 +12041,7 @@ struct llama_context * llama_new_context_with_model(
             }
 
             // buffer used to store the computation graph and the tensor meta data
-            ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
+            ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
 
             ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);