also duplicate gpu compute buffers to avoid races

slaren 2024-01-20 18:52:33 +01:00
parent a97198747f
commit 16e12ab734
2 changed files with 64 additions and 26 deletions

ggml-cuda.cu

@@ -10842,8 +10842,11 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
 GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
     GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
+    ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
+    ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
+
     // host -> device
-    if (ggml_backend_buffer_is_cuda_host(src->buffer) && ggml_backend_buffer_is_cuda(dst->buffer)) {
+    if (ggml_backend_buffer_is_cuda_host(buf_src) && ggml_backend_buffer_is_cuda(buf_dst)) {
         ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;

         // make sure the data is ready on the source backend
         // the CPU backend does not support async compute, so this does nothing at the moment
@@ -10854,7 +10857,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
     }

     // device -> host
-    if (ggml_backend_buffer_is_cuda_host(dst->buffer) && ggml_backend_buffer_is_cuda(src->buffer)) {
+    if (ggml_backend_buffer_is_cuda_host(buf_dst) && ggml_backend_buffer_is_cuda(buf_src)) {
         // this shouldn't happen currently because the dst backend is our own backend, which does not support host buffers
         GGML_ASSERT(false);
         ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
@@ -10875,9 +10878,14 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
     ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;

     if (backend_src != backend_dst) {
         //printf("async copy between devices %s, %d -> %d\n", src->name, cuda_ctx_src->device, cuda_ctx_dst->device);
-        cudaDeviceSynchronize();
+        ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
+        ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
+
+        GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
+        GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
+
         // TODO: reuse event?
         ggml_cuda_set_device(cuda_ctx_src->device);
         cudaEvent_t event;
         CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
@@ -10885,12 +10893,16 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
         CUDA_CHECK(cudaEventRecord(event, g_cudaStreams[cuda_ctx_src->device][0]));

         // wait on dst stream
         ggml_cuda_set_device(cuda_ctx_dst->device);
         CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx_dst->device][0], event, 0));
+
+        CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), g_cudaStreams[cuda_ctx_dst->device][0]));
+
         CUDA_CHECK(cudaEventDestroy(event));
-    }
-
-    // copy
-    CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
+    } else {
+        // copy
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0]));
+    }

     return true;
 }
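
For reference, the cross-device branch above is the standard CUDA inter-stream handshake: record an event on the source device's stream, make the destination stream wait on that event, then issue the copy with cudaMemcpyPeerAsync so it is ordered behind the producer's work. A minimal standalone sketch of the same pattern, assuming two visible devices (the stream and variable names are illustrative, not ggml's):

    // peer_copy.cu — illustrative sketch, not part of the commit
    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    #define CHECK(call) do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_)); \
            std::exit(1); \
        } \
    } while (0)

    int main() {
        const size_t nbytes = (1 << 20) * sizeof(float);
        cudaStream_t stream0, stream1;   // stand-ins for g_cudaStreams[device][0]
        float *d0, *d1;

        CHECK(cudaSetDevice(0));
        CHECK(cudaStreamCreate(&stream0));
        CHECK(cudaMalloc(&d0, nbytes));

        CHECK(cudaSetDevice(1));
        CHECK(cudaStreamCreate(&stream1));
        CHECK(cudaMalloc(&d1, nbytes));

        // source side: mark the point after which the data is valid
        CHECK(cudaSetDevice(0));
        cudaEvent_t event;
        CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
        CHECK(cudaEventRecord(event, stream0));

        // destination side: wait for the event, then copy on the dst stream
        CHECK(cudaSetDevice(1));
        CHECK(cudaStreamWaitEvent(stream1, event, 0));
        CHECK(cudaMemcpyPeerAsync(d1, 1, d0, 0, nbytes, stream1));

        CHECK(cudaStreamSynchronize(stream1));
        CHECK(cudaEventDestroy(event));
        return 0;
    }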

llama.cpp

@@ -1663,6 +1663,16 @@ struct llama_model {
 struct llama_context {
     llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
     ~llama_context() {
+        for (auto & it : bufs_compute) {
+            // restore the original buffer in the tallocr
+            ggml_tallocr_t allocr = ggml_backend_sched_get_tallocr(sched, it.first);
+            ggml_tallocr_set_buffer(allocr, it.second[0]);
+
+            // free the rest of the buffers
+            for (size_t i = 1; i < it.second.size(); ++i) {
+                ggml_backend_buffer_free(it.second[i]);
+            }
+        }
         ggml_backend_sched_free(sched);
         for (ggml_backend_t backend : backends) {
@@ -1670,6 +1680,7 @@ struct llama_context {
         }
+        ggml_backend_buffer_free(buf_logits);
     }

     llama_cparams cparams;
@@ -1719,10 +1730,11 @@ struct llama_context {
     std::vector<uint8_t> buf_compute_meta;
     ggml_backend_sched_t sched = nullptr;

     // allocator for the input tensors
-    ggml_tallocr * alloc_cpu = nullptr;
+    ggml_tallocr_t alloc_cpu = nullptr;
-    std::vector<ggml_backend_buffer_t> buf_cpu_ub;
-    size_t buf_cpu_ub_cur = 0;
+    std::map<ggml_backend_t, std::vector<ggml_backend_buffer_t>> bufs_compute;
+    size_t n_compute_bufs = 0;
+    size_t i_compute_buf = 0;

     // temporary buffer for copying data to/from the backend
     std::vector<no_init<uint8_t>> buf_copy;
@@ -6704,15 +6716,17 @@ static int llama_decode_internal(
         //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);

         // change the CPU compute buffer to avoid overwriting inputs
-        size_t buf_cpu_ub_cur = lctx.buf_cpu_ub_cur;
-        lctx.buf_cpu_ub_cur = (lctx.buf_cpu_ub_cur + 1) % lctx.buf_cpu_ub.size();
-        if (buf_cpu_ub_cur == 0 && cur_token > 0) {
+        size_t i_compute_buf = lctx.i_compute_buf;
+        lctx.i_compute_buf = (lctx.i_compute_buf + 1) % lctx.n_compute_bufs;
+        if (i_compute_buf == 0 && cur_token > 0) {
             // sync all backends to ensure that the current buffer is not in use
             printf("not enough buffers, syncing now\n");
             ggml_backend_sched_synchronize(lctx.sched);
         }
-        ggml_tallocr_set_buffer(lctx.alloc_cpu, lctx.buf_cpu_ub.at(buf_cpu_ub_cur));
+        for (auto it : lctx.bufs_compute) {
+            ggml_tallocr_t alloc = ggml_backend_sched_get_tallocr(lctx.sched, it.first);
+            ggml_tallocr_set_buffer(alloc, it.second.at(i_compute_buf));
+        }

         ggml_backend_sched_reset(lctx.sched);
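
The rotation driving this hunk is round-robin double-buffering generalized to n buffers: each micro-batch writes its inputs into the next duplicate, and only when the index wraps (more in-flight micro-batches than duplicates) does the code fall back to a full synchronization. A self-contained C++ analogue of the scheme, with hypothetical Buffer and sync_all_backends() stand-ins for the ggml types:

    // buffer_ring.cpp — illustrative analogue, not ggml API
    #include <cstdio>
    #include <vector>

    struct Buffer { std::vector<unsigned char> data; };

    static void sync_all_backends() { /* placeholder: drain in-flight graphs */ }

    struct BufferRing {
        std::vector<Buffer> bufs;
        size_t cur = 0;

        // hand out buffers round-robin; `step` counts micro-batches so far
        Buffer & next(size_t step) {
            size_t i = cur;
            cur = (cur + 1) % bufs.size();
            if (i == 0 && step > 0) {
                // wrapped around: the oldest buffer may still be read by a
                // pending graph, so block until everything has drained
                std::printf("not enough buffers, syncing now\n");
                sync_all_backends();
            }
            return bufs[i];
        }
    };

    int main() {
        BufferRing ring;
        ring.bufs.assign(4, Buffer{std::vector<unsigned char>(1024)});

        for (size_t step = 0; step < 10; ++step) {
            Buffer & b = ring.next(step);                 // inputs for this micro-batch land here
            b.data[0] = static_cast<unsigned char>(step); // pretend to write inputs
        }
        return 0;
    }

Note the ownership rule the destructor hunk earlier encodes for this scheme: entry 0 of each vector is the scheduler's original buffer, restored into the tallocr before ggml_backend_sched_free so the scheduler tears it down, while only the duplicates (entries 1 to n_ub-1) are freed explicitly.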
@@ -6833,7 +6847,7 @@ static int llama_decode_internal(
     }

     ggml_backend_sched_synchronize(lctx.sched);
-    lctx.buf_cpu_ub_cur = 0;
+    lctx.i_compute_buf = 0;

     // measure the performance only for the single-token evals
     if (n_tokens_all == 1) {
@@ -10003,14 +10017,26 @@ struct llama_context * llama_new_context_with_model(
         ctx->alloc_cpu = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);

         // duplicate cpu buffers for microbatching
-        ggml_backend_buffer_t buf_cpu = ggml_tallocr_get_buffer(ctx->alloc_cpu);
-        size_t buf_size = ggml_backend_buffer_get_size(buf_cpu);
-        ctx->buf_cpu_ub.push_back(buf_cpu);
-        int n_ub = 64;
-        for (int i = 1; i < n_ub; ++i) {
-            ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buf_size);
-            ctx->buf_cpu_ub.push_back(buf);
+        const int n_ub = 16;
+        ctx->n_compute_bufs = n_ub;
+        for (ggml_backend_t b : ctx->backends) {
+            ggml_tallocr_t alloc = ggml_backend_sched_get_tallocr(ctx->sched, b);
+            ggml_backend_buffer_t buf = ggml_tallocr_get_buffer(alloc);
+            size_t buf_size = ggml_backend_buffer_get_size(buf);
+            ctx->bufs_compute[b].push_back(buf);
+
+            auto * buft = ggml_backend_buffer_get_type(buf);
+            for (int i = 1; i < n_ub; ++i) {
+                ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, buf_size);
+                if (buf == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to allocate compute buffer\n", __func__);
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->bufs_compute[b].push_back(buf);
+            }
         }

         // allocate buffer for logits output
         ctx->buf_logits = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), hparams.n_vocab*cparams.n_ctx*sizeof(float));
         if (ctx->buf_logits == nullptr) {
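
One consequence of this loop worth keeping in mind: compute-buffer memory now scales linearly with n_ub on every backend, which is presumably why the count drops from 64 (previously CPU-only) to 16 copies per backend. A back-of-the-envelope check with an illustrative buffer size:

    // illustrative arithmetic only; the 300 MiB figure is hypothetical
    #include <cstdio>

    int main() {
        const double buf_mib = 300.0; // assumed compute-buffer size on one backend
        const int    n_ub    = 16;    // as set in the hunk above

        // every duplicate is a full-size allocation from the same buffer type
        std::printf("per-backend compute memory: %.0f MiB\n", buf_mib * n_ub); // 4800 MiB
        return 0;
    }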
@@ -10816,13 +10842,13 @@ int32_t llama_decode(
 float * llama_get_logits(struct llama_context * ctx) {
     ggml_backend_sched_synchronize(ctx->sched);
-    ctx->buf_cpu_ub_cur = 0;
+    ctx->i_compute_buf = 0;

     return ctx->logits;
 }

 float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
     ggml_backend_sched_synchronize(ctx->sched);
-    ctx->buf_cpu_ub_cur = 0;
+    ctx->i_compute_buf = 0;

     assert(ctx->logits_valid.at(i));
     return ctx->logits + i*ctx->model.hparams.n_vocab;
@@ -10830,7 +10856,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
 float * llama_get_embeddings(struct llama_context * ctx) {
     ggml_backend_sched_synchronize(ctx->sched);
-    ctx->buf_cpu_ub_cur = 0;
+    ctx->i_compute_buf = 0;

     return ctx->embedding.data();
 }
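
Since every results getter now synchronizes the scheduler and rewinds i_compute_buf, callers keep their usual blocking read semantics with no API change. A minimal caller-side sketch, assuming an already-initialized llama_context (model loading omitted) and the batch API as it existed at the time; the token id is hypothetical:

    // decode_sketch.cpp — illustrative usage, not part of the commit
    #include <cstdio>
    #include "llama.h"

    static void decode_and_read(llama_context * ctx) {
        llama_token tokens[1] = { 1 }; // hypothetical prompt token
        llama_batch batch = llama_batch_get_one(tokens, 1, 0, 0);

        if (llama_decode(ctx, batch) != 0) {
            std::fprintf(stderr, "decode failed\n");
            return;
        }

        // synchronizes all backends and resets i_compute_buf before reading
        float * logits = llama_get_logits(ctx);
        (void)logits;
    }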