From 16e12ab734f4e3a9b49df6597890f0f73e972a21 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Sat, 20 Jan 2024 18:52:33 +0100
Subject: [PATCH] also duplicate gpu compute buffers to avoid races

---
 ggml-cuda.cu | 26 +++++++++++++++------
 llama.cpp    | 64 ++++++++++++++++++++++++++++++++++++----------------
 2 files changed, 64 insertions(+), 26 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 9fe390054..4dd112915 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -10842,8 +10842,11 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
 GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
     GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
 
+    ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
+    ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
+
     // host -> device
-    if (ggml_backend_buffer_is_cuda_host(src->buffer) && ggml_backend_buffer_is_cuda(dst->buffer)) {
+    if (ggml_backend_buffer_is_cuda_host(buf_src) && ggml_backend_buffer_is_cuda(buf_dst)) {
         ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
         // make sure the data is ready on the source backend
         // the CPU backend does not support async compute, so this does nothing at the moment
@@ -10854,7 +10857,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
     }
 
     // device -> host
-    if (ggml_backend_buffer_is_cuda_host(dst->buffer) && ggml_backend_buffer_is_cuda(src->buffer)) {
+    if (ggml_backend_buffer_is_cuda_host(buf_dst) && ggml_backend_buffer_is_cuda(buf_src)) {
         // this shoudln't happen currently because the dst backend is our own backend, which does not support host buffers
         GGML_ASSERT(false);
         ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
@@ -10875,9 +10878,14 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
     ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
 
     if (backend_src != backend_dst) {
-        //printf("async copy between devices %s, %d -> %d\n", src->name, cuda_ctx_src->device, cuda_ctx_dst->device);
-        cudaDeviceSynchronize();
-        // TODO: reuse event?
+        ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
+        ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
+
+        GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
+        GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
+
+        ggml_cuda_set_device(cuda_ctx_src->device);
+
         cudaEvent_t event;
         CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
 
@@ -10885,12 +10893,16 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
         CUDA_CHECK(cudaEventRecord(event, g_cudaStreams[cuda_ctx_src->device][0]));
 
         // wait on dst stream
+        ggml_cuda_set_device(cuda_ctx_dst->device);
         CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx_dst->device][0], event, 0));
 
+        CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), g_cudaStreams[cuda_ctx_dst->device][0]));
+
         CUDA_CHECK(cudaEventDestroy(event));
+    } else {
+        // copy
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
     }
-    // copy
-    CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
     return true;
 }
 
diff --git a/llama.cpp b/llama.cpp
index 413dd0480..697030bd3 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1663,6 +1663,16 @@ struct llama_model {
 struct llama_context {
     llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
     ~llama_context() {
+        for (auto & it : bufs_compute) {
+            // restore the original buffer in the tallocr
+            ggml_tallocr_t allocr = ggml_backend_sched_get_tallocr(sched, it.first);
+            ggml_tallocr_set_buffer(allocr, it.second[0]);
+            // free the rest of the buffers
+            for (size_t i = 1; i < it.second.size(); ++i) {
+                ggml_backend_buffer_free(it.second[i]);
+            }
+        }
+
         ggml_backend_sched_free(sched);
 
         for (ggml_backend_t backend : backends) {
@@ -1670,6 +1680,7 @@ struct llama_context {
         }
 
         ggml_backend_buffer_free(buf_logits);
+
     }
 
     llama_cparams cparams;
@@ -1719,10 +1730,11 @@ struct llama_context {
     std::vector<uint8_t> buf_compute_meta;
     ggml_backend_sched_t sched = nullptr;
     // allocator for the input tensors
-    ggml_tallocr * alloc_cpu = nullptr;
+    ggml_tallocr_t alloc_cpu = nullptr;
 
-    std::vector<ggml_backend_buffer_t> buf_cpu_ub;
-    size_t buf_cpu_ub_cur = 0;
+    std::map<ggml_backend_t, std::vector<ggml_backend_buffer_t>> bufs_compute;
+    size_t n_compute_bufs = 0;
+    size_t i_compute_buf = 0;
 
     // temporary buffer for copying data to/from the backend
     std::vector<no_init<uint8_t>> buf_copy;
@@ -6704,15 +6716,17 @@ static int llama_decode_internal(
         //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
 
         // change the CPU compute buffer to avoid overwriting inputs
-        size_t buf_cpu_ub_cur = lctx.buf_cpu_ub_cur;
-        lctx.buf_cpu_ub_cur = (lctx.buf_cpu_ub_cur + 1) % lctx.buf_cpu_ub.size();
-        if (buf_cpu_ub_cur == 0 && cur_token > 0) {
+        size_t i_compute_buf = lctx.i_compute_buf;
+        lctx.i_compute_buf = (lctx.i_compute_buf + 1) % lctx.n_compute_bufs;
+        if (i_compute_buf == 0 && cur_token > 0) {
             // sync all backends to ensure that the current buffer is not in use
             printf("not enough buffers, syncing now\n");
             ggml_backend_sched_synchronize(lctx.sched);
         }
-
-        ggml_tallocr_set_buffer(lctx.alloc_cpu, lctx.buf_cpu_ub.at(buf_cpu_ub_cur));
+        for (auto it : lctx.bufs_compute) {
+            ggml_tallocr_t alloc = ggml_backend_sched_get_tallocr(lctx.sched, it.first);
+            ggml_tallocr_set_buffer(alloc, it.second.at(i_compute_buf));
+        }
 
         ggml_backend_sched_reset(lctx.sched);
 
@@ -6833,7 +6847,7 @@ static int llama_decode_internal(
     }
 
     ggml_backend_sched_synchronize(lctx.sched);
-    lctx.buf_cpu_ub_cur = 0;
+    lctx.i_compute_buf = 0;
 
     // measure the performance only for the single-token evals
     if (n_tokens_all == 1) {
@@ -10003,14 +10017,26 @@ struct llama_context * llama_new_context_with_model(
             ctx->alloc_cpu = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
 
             // duplicate cpu buffers for microbatching
-            ggml_backend_buffer_t buf_cpu = ggml_tallocr_get_buffer(ctx->alloc_cpu);
-            size_t buf_size = ggml_backend_buffer_get_size(buf_cpu);
-            ctx->buf_cpu_ub.push_back(buf_cpu);
-            int n_ub = 64;
-            for (int i = 1; i < n_ub; ++i) {
-                ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buf_size);
-                ctx->buf_cpu_ub.push_back(buf);
+            const int n_ub = 16;
+            ctx->n_compute_bufs = n_ub;
+
+            for (ggml_backend_t b : ctx->backends) {
+                ggml_tallocr_t alloc = ggml_backend_sched_get_tallocr(ctx->sched, b);
+                ggml_backend_buffer_t buf = ggml_tallocr_get_buffer(alloc);
+                size_t buf_size = ggml_backend_buffer_get_size(buf);
+                ctx->bufs_compute[b].push_back(buf);
+                auto * buft = ggml_backend_buffer_get_type(buf);
+                for (int i = 1; i < n_ub; ++i) {
+                    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, buf_size);
+                    if (buf == nullptr) {
+                        LLAMA_LOG_ERROR("%s: failed to allocate compute buffer\n", __func__);
+                        llama_free(ctx);
+                        return nullptr;
+                    }
+                    ctx->bufs_compute[b].push_back(buf);
+                }
             }
+
             // allocate buffer for logits output
             ctx->buf_logits = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), hparams.n_vocab*cparams.n_ctx*sizeof(float));
             if (ctx->buf_logits == nullptr) {
@@ -10816,13 +10842,13 @@ int32_t llama_decode(
 
 float * llama_get_logits(struct llama_context * ctx) {
     ggml_backend_sched_synchronize(ctx->sched);
-    ctx->buf_cpu_ub_cur = 0;
+    ctx->i_compute_buf = 0;
     return ctx->logits;
 }
 
 float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
     ggml_backend_sched_synchronize(ctx->sched);
-    ctx->buf_cpu_ub_cur = 0;
+    ctx->i_compute_buf = 0;
 
     assert(ctx->logits_valid.at(i));
     return ctx->logits + i*ctx->model.hparams.n_vocab;
@@ -10830,7 +10856,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
 
 float * llama_get_embeddings(struct llama_context * ctx) {
     ggml_backend_sched_synchronize(ctx->sched);
-    ctx->buf_cpu_ub_cur = 0;
+    ctx->i_compute_buf = 0;
 
     return ctx->embedding.data();
 }