From b723fe7028d417046f38919dd8bea9a9698f0d5c Mon Sep 17 00:00:00 2001
From: Bach Le
Date: Wed, 12 Jul 2023 22:59:04 +0800
Subject: [PATCH] Track and free temporary ggml_tensor_extra_gpu struct created
 during eval

---
 ggml-cuda.cu | 43 +++++++++++++++++++++++++++++++++++++++++--
 ggml-cuda.h  |  2 ++
 llama.cpp    |  6 ++++++
 3 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 89e69bdc1..2a9aeaafa 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -239,6 +239,11 @@ struct ggml_tensor_extra_gpu {
     cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
 };
 
+struct ggml_tensor_extra_gpu_tmp {
+    struct ggml_tensor_extra_gpu main;
+    struct ggml_tensor_extra_gpu_tmp * next;
+};
+
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -2080,6 +2085,9 @@ struct cuda_buffer {
     size_t size = 0;
 };
 
+// TODO: Make this TLS?
+static struct ggml_tensor_extra_gpu_tmp * g_eval_extras = NULL;
+static bool g_evaluating = false;
 static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
 static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
 
@@ -3260,6 +3268,38 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
     delete extra;
 }
 
+void ggml_cuda_begin_eval() {
+    GGML_ASSERT(!g_evaluating);
+    g_evaluating = true;
+}
+
+struct ggml_tensor_extra_gpu * ggml_cuda_alloc_extra() {
+    if (g_evaluating) {
+        struct ggml_tensor_extra_gpu_tmp * extra = new ggml_tensor_extra_gpu_tmp;
+        memset(extra, 0, sizeof(*extra));
+        extra->next = g_eval_extras;
+        g_eval_extras = extra;
+        return &extra->main;
+    } else {
+        struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
+        memset(extra, 0, sizeof(*extra));
+        return extra;
+    }
+}
+
+void ggml_cuda_end_eval() {
+    GGML_ASSERT(g_evaluating);
+
+    for (struct ggml_tensor_extra_gpu_tmp * i = g_eval_extras; i != nullptr;) {
+        struct ggml_tensor_extra_gpu_tmp * next = i->next;
+        delete i;
+        i = next;
+    }
+
+    g_eval_extras = nullptr;
+    g_evaluating = false;
+}
+
 void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
     if (scratch && g_scratch_size == 0) {
         return;
@@ -3277,8 +3317,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
     }
 
     tensor->backend = GGML_BACKEND_GPU;
-    struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
-    memset(extra, 0, sizeof(*extra));
+    struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_extra();
 
     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW ||
diff --git a/ggml-cuda.h b/ggml-cuda.h
index 3c1e8deb6..ccdf9c73d 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -30,6 +30,8 @@ void ggml_cuda_set_main_device(int main_device);
 void ggml_cuda_set_scratch_size(size_t scratch_size);
 void ggml_cuda_free_scratch(void);
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+void ggml_cuda_begin_eval(void);
+void ggml_cuda_end_eval(void);
 
 #ifdef __cplusplus
 }
diff --git a/llama.cpp b/llama.cpp
index 2d09d6ce7..86750fbf6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1379,6 +1379,8 @@ static bool llama_eval_internal(
     offload_func_t offload_func_v = llama_nop;
 
 #ifdef GGML_USE_CUBLAS
+    ggml_cuda_begin_eval();
+
     if (n_gpu_layers > n_layer) {
         offload_func_nr = ggml_cuda_assign_buffers;
     }
@@ -1721,6 +1723,10 @@ static bool llama_eval_internal(
             lctx.get_buf_max_mem(1)/1024.0/1024.0);
 #endif
 
+#ifdef GGML_USE_CUBLAS
+    ggml_cuda_end_eval();
+#endif
+
     ggml_free(ctx0);
 
     // measure the performance only for the single-token evals
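
Usage note (not part of the patch): a minimal sketch of the pairing the new API expects, assuming ggml-cuda.h is on the include path. The wrapper name with_tracked_cuda_extras and the run_eval callback are hypothetical stand-ins for the graph build and compute work that llama_eval_internal performs between the two calls.

// Illustrative sketch only; not part of the diff above.
#include "ggml-cuda.h"

template <typename F>
static void with_tracked_cuda_extras(F && run_eval) {
    ggml_cuda_begin_eval();  // asserts no eval is already in flight; extras allocated
                             // from here on are recorded in the g_eval_extras list
    run_eval();              // graph build + compute; ggml_cuda_assign_buffers() obtains
                             // its extras through ggml_cuda_alloc_extra()
    ggml_cuda_end_eval();    // walks g_eval_extras, deletes every tracked extra,
                             // and clears the tracking state for the next eval
}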