Change requirement of last backend being CPU to requiring its default buffer type be a host buffer, fix rebase errors
parent e8a61568e9
commit 2217b02c99
3 changed files with 16 additions and 11 deletions
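For context, the relaxed requirement can be expressed directly against the public ggml-backend API. The sketch below is illustrative only: the helper name check_last_backend is made up, while the three ggml_backend_* calls are the same ones used in the first hunk below.

#include <assert.h>
#include "ggml-backend.h"

// Illustrative helper (not part of the commit): the scheduler no longer requires
// that the last backend *is* the CPU backend, only that its default buffer type
// is host-addressable. This also admits wrappers around the CPU backend, such as
// the MPI backend, whose default buffers still live in host memory.
static void check_last_backend(ggml_backend_t * backends, int n_backends) {
    ggml_backend_t last = backends[n_backends - 1];

    // old, stricter requirement:
    // assert(ggml_backend_is_cpu(last));

    // new, relaxed requirement:
    assert(ggml_backend_buft_is_host(ggml_backend_get_default_buffer_type(last)));
}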
@@ -1696,7 +1696,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
         bool parallel) {
     GGML_ASSERT(n_backends > 0);
     GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
-    GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
+    GGML_ASSERT(ggml_backend_buft_is_host(ggml_backend_get_default_buffer_type(backends[n_backends - 1]))); // last backend must be host

     struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
@@ -548,7 +548,7 @@ GGML_CALL static enum ggml_status ggml_backend_mpi_graph_compute(ggml_backend_t

     if (!ctx->remote) {
         ggml_backend_sched_t sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(),
-                                                            (int) ctx->backends.size(), cgraph->n_nodes);
+                                                            (int) ctx->backends.size(), cgraph->n_nodes, false);

         ggml_backend_sched_reserve(sched, cgraph);
         ggml_backend_sched_graph_compute(sched, cgraph);
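The extra trailing argument reflects the upstream change that added a parallel flag to ggml_backend_sched_new; this backend opts out by passing false. A minimal usage sketch, assuming two already-initialized backends (the variable names backend_gpu, backend_cpu, and graph are illustrative):

// Sketch only: schedule and run a graph across two backends with the updated
// ggml_backend_sched_new signature. Passing NULL for the buffer types selects
// each backend's default; the final `false` disables parallel scheduling.
ggml_backend_t backends[2] = { backend_gpu, backend_cpu };   // illustrative handles

ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 2, graph->n_nodes, false);

ggml_backend_sched_reserve(sched, graph);        // pre-allocate compute buffers for the graph
ggml_backend_sched_graph_compute(sched, graph);  // split the graph across backends and run it
ggml_backend_sched_free(sched);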
@@ -850,7 +850,7 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer
     return buffer;
 }

-bool ggml_backend_mpi_cpy_tensor_async(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+bool ggml_backend_mpi_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) {
 //    int src_rank = ggml_backend_mpi_buffer_rank(src->buffer);
 //    int dst_rank = ggml_backend_mpi_buffer_rank(dst->buffer);
 //
@@ -870,7 +870,8 @@ bool ggml_backend_mpi_cpy_tensor_async(ggml_backend_t backend, const struct ggml
 //    } else if (dst_rank == ggml_backend_mpi_local_rank(backend)){
 //        ggml_mpi_tensor_recv(dst, src_rank, ctx->comm);
 //    }
-    return true;
+//    fprintf(stderr, "ATTEMPTING ASYNC COPY FOR SRC TENSOR %s TO DST TENSOR %s WITH SRC BACKEND %s AND DST BACKEND %s\n", src->name, dst->name, ggml_backend_name(backend_src), ggml_backend_name(backend_dst));
+    return false;

 }
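The widened signature matches the upstream backend interface, where asynchronous tensor copies receive both the source and the destination backend; returning false signals that the backend does not handle the copy and the caller should fall back to a blocking copy. A hedged sketch of that calling convention (the wrapper function copy_tensor_maybe_async is hypothetical):

// Hypothetical wrapper illustrating the contract: if the MPI backend declines
// the async copy (returns false), fall back to the generic blocking copy.
static void copy_tensor_maybe_async(ggml_backend_t backend_src, ggml_backend_t backend_dst,
                                    struct ggml_tensor * src, struct ggml_tensor * dst) {
    if (!ggml_backend_mpi_cpy_tensor_async(backend_src, backend_dst, src, dst)) {
        ggml_backend_tensor_copy(src, dst);  // synchronous fall-back
    }
}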
18 llama.cpp
@@ -9012,13 +9012,15 @@ static int llama_decode_internal(
    //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
    //}

#ifdef GGML_USE_MPI
    if (ggml_mpi_rank(lctx.model.ctx_mpi) == 0) {
#endif

    // extract logits
    // TODO: do not compute and extract logits if only embeddings are needed
    //       update the graphs to skip "result_output" if logits are not needed
    if (res) {
#ifdef GGML_USE_MPI
        if (ggml_mpi_rank(lctx.model.ctx_mpi) == 0) {
#endif

        ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
        GGML_ASSERT(backend_res != nullptr);
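The intent of the guard, in both positions, is that only MPI rank 0 holds the final "result_output" tensor, so only rank 0 reads logits back; other ranks skip the block. A hypothetical helper that centralizes the same rule (not part of the commit; it assumes the fork's ggml-mpi.h and its ggml_mpi_rank function):

#include "ggml-mpi.h"   // fork-specific header providing ggml_mpi_rank()

// Hypothetical helper (not in the commit): expresses the "only rank 0 extracts
// the model output" rule that the #ifdef blocks above write out inline.
static bool should_extract_logits(struct ggml_mpi_context * ctx_mpi) {
    return ggml_mpi_rank(ctx_mpi) == 0;
}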
@@ -9104,6 +9106,10 @@ static int llama_decode_internal(
                } break;
            }
        }
#ifdef GGML_USE_MPI
        }
#endif
    }

    // wait for the computation to finish (automatically done when obtaining the model output)
@@ -9121,9 +9127,7 @@ static int llama_decode_internal(
        }
    }

#ifdef GGML_USE_MPI
    }
#endif

    return 0;
}
@@ -13051,7 +13055,7 @@ struct llama_context * llama_new_context_with_model(

 //        ctx->backend_cpu = ctx->backends.back();
-        ctx->backends.push_back(ctx->backend_cpu);
+        ctx->backends.push_back(ggml_backend_mpi_init(&ctx->backend_cpu, 1, ggml_mpi_rank(model->ctx_mpi)));

 #endif