From 968cefb4a9c430c09b2b7a4df9dbc24b74efe593 Mon Sep 17 00:00:00 2001
From: Branden Butler
Date: Mon, 19 Feb 2024 12:21:48 -0600
Subject: [PATCH] Wrap backends with MPI backend

---
 ggml-mpi.cpp | 124 +++++++++++++++++++--------------------------------
 ggml-mpi.h   |   9 ++--
 llama.cpp    |  50 +++++++++------------
 llama.h      |   3 ++
 4 files changed, 74 insertions(+), 112 deletions(-)

diff --git a/ggml-mpi.cpp b/ggml-mpi.cpp
index b43dd96d1..3d2fc829e 100644
--- a/ggml-mpi.cpp
+++ b/ggml-mpi.cpp
@@ -22,6 +22,7 @@ struct ggml_mpi_context {
     int layer_end;
     struct ggml_tensor *inp0;
     std::string name;
+    struct ggml_backend * wrapped_backend;
 };
 
 void ggml_mpi_backend_init(void) {
@@ -247,8 +248,6 @@ void ggml_mpi_scatter_layers(
 }
 
 void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * gf, int n_layers) {
-    const int mpi_rank = ctx_mpi->rank;
-    const int mpi_size = ctx_mpi->size;
 
     struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
     if (inp_tokens == NULL) {
@@ -286,73 +285,22 @@ void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml
 
     }
 
-    {
-
-
-        //const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size;
-
-        const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1;
-
-        //const int il0 = (mpi_idx + 0) * n_per_node;
-        //const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node);
-        int il0 = ctx_mpi->layer_start;
-        int il1 = MIN(n_layers, ctx_mpi->layer_end);
-
-        char name_l0[GGML_MAX_NAME];
-        char name_l1[GGML_MAX_NAME];
-
-        snprintf(name_l0, sizeof(name_l0), "layer_inp_%d", il0);
-        snprintf(name_l1, sizeof(name_l1), "layer_inp_%d", il1);
-
-        const int idx_l0 = ggml_graph_get_node_idx(gf, name_l0);
-        const int idx_l1 = mpi_rank > 0 ? ggml_graph_get_node_idx(gf, name_l1) + 1 : gf->n_nodes;
-
-        if (idx_l0 < 0 || idx_l1 < 0) {
-            fprintf(stderr, "%s: layer input nodes not found\n", __func__);
-            return;
-        }
-
-        // attach the input data to all nodes that need it
-        // TODO: not great - should be able to do this without modifying the compute graph (see next TODO below)
-        for (int i = idx_l0; i < idx_l1; i++) {
-            if (gf->nodes[i]->src[0] == gf->nodes[idx_l0]) {
-                gf->nodes[i]->src[0] = inp0;
-            }
-            if (gf->nodes[i]->src[1] == gf->nodes[idx_l0]) {
-                gf->nodes[i]->src[1] = inp0;
-            }
-        }
-
-        // TODO: instead of rearranging the nodes, we should be able to execute a subset of the compute graph
-        for (int i = 1; i < idx_l1 - idx_l0; i++) {
-            gf->nodes[i] = gf->nodes[idx_l0 + i];
-        }
-
-        // the first node performs the "get_rows" operation, the rest of the nodes get the data from the previous node
-        if (mpi_idx != 0) {
-            gf->nodes[0]->op = GGML_OP_NONE;
-        }
-
-        gf->n_nodes = idx_l1 - idx_l0;
-
-    }
 }
 
 // TODO: there are many improvements that can be done to this implementation
 
 void ggml_mpi_graph_compute_pre(
         struct ggml_mpi_context * ctx_mpi,
-        struct ggml_cgraph * gf,
-        int n_layers) {
+        struct ggml_cgraph * gf) {
     const int mpi_rank = ctx_mpi->rank;
     const int mpi_size = ctx_mpi->size;
 
-    struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
+    struct ggml_tensor * inp_tokens = gf->nodes[0];
     if (inp_tokens == NULL) {
         fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__);
         return;
     }
 
-    struct ggml_tensor * inp0 = ctx_mpi->inp0;
+    struct ggml_tensor * inp0 = ggml_graph_get_tensor(gf, "layer_inp_0");
     if (inp0 == NULL) {
         fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__);
         return;
@@ -381,9 +329,7 @@ void ggml_mpi_graph_compute_pre(
 
 void ggml_mpi_graph_compute_post(
         struct ggml_mpi_context * ctx_mpi,
-        struct ggml_cgraph * gf,
-        int n_layers) {
-    UNUSED(n_layers);
+        struct ggml_cgraph * gf) {
 
     const int mpi_rank = ctx_mpi->rank;
     const int mpi_size = ctx_mpi->size;
@@ -396,9 +342,24 @@ void ggml_mpi_graph_compute_post(
 
 // BACKEND V2
 
+GGML_CALL static bool ggml_backend_mpi_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+
+    struct ggml_mpi_context * ctx = (ggml_mpi_context *) backend->context;
+
+    ggml_mpi_graph_compute_pre(ctx, cgraph);
+
+    ggml_backend_t wrapped_backend = ctx->wrapped_backend;
+    bool ret = ggml_backend_graph_compute(wrapped_backend, cgraph);
+
+    ggml_mpi_graph_compute_post(ctx, cgraph);
+
+    return ret;
+}
+
+
 static const char * ggml_backend_mpi_name(ggml_backend_t backend) {
     auto * ctx = static_cast<ggml_mpi_context *>(backend->context);
-    return ctx->name.c_str();
+    return ctx->wrapped_backend->iface.get_name(backend);
 }
 
 static void ggml_backend_mpi_free(ggml_backend_t backend) {
@@ -427,20 +388,6 @@ GGML_CALL static bool ggml_backend_mpi_supports_op(ggml_backend_t backend, const
     GGML_UNUSED(backend);
 }
 
-static struct ggml_backend_i mpi_backend_i = {
-        /* .get_name                = */ ggml_backend_mpi_name,
-        /* .free                    = */ ggml_backend_mpi_free,
-        /* .get_default_buffer_type = */ ggml_backend_mpi_get_default_buffer_type,
-        /* .set_tensor_async        = */ NULL,
-        /* .get_tensor_async        = */ NULL,
-        /* .cpy_tensor_async        = */ NULL,
-        /* .synchronize             = */ NULL,
-        /* .graph_plan_create       = */ NULL,
-        /* .graph_plan_free         = */ NULL,
-        /* .graph_plan_compute      = */ NULL,
-        /* .graph_compute           = */ ggml_backend_graph_compute,
-        /* .supports_op             = */ ggml_backend_mpi_supports_op,
-};
 
 std::vector<ggml_mpi_device> ggml_mpi_available_devices_internal() {
@@ -473,23 +420,42 @@ ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_type
     return ggml_backend_wrapped_buffer_type;
 }
 
-ggml_backend_t ggml_backend_mpi_init(int index) {
+ggml_backend_t ggml_backend_mpi_init(ggml_backend_t wrapped_backend) {
+
+    struct ggml_backend_i mpi_backend_i = {
+            /* .get_name                = */ wrapped_backend->iface.get_name,
+            /* .free                    = */ ggml_backend_mpi_free,
+            /* .get_default_buffer_type = */ ggml_backend_mpi_get_default_buffer_type,
+            /* .set_tensor_async        = */ NULL,
+            /* .get_tensor_async        = */ NULL,
+            /* .cpy_tensor_async        = */ NULL,
+            /* .synchronize             = */ NULL,
+            /* .graph_plan_create       = */ NULL,
+            /* .graph_plan_free         = */ NULL,
+            /* .graph_plan_compute      = */ NULL,
+            /* .graph_compute           = */ ggml_backend_mpi_graph_compute,
+            /* .supports_op             = */ ggml_backend_mpi_supports_op,
+    };
+
+    ggml_mpi_context * ctx = ggml_mpi_init();
+    ctx->wrapped_backend = wrapped_backend;
 
     auto *mpi_backend = new ggml_backend {
             /* .interface = */ mpi_backend_i,
-            /* .context   = */ ggml_mpi_init(),
+            /* .context   = */ ctx,
     };
 
     return mpi_backend;
 }
 
 static ggml_backend_t ggml_backend_reg_mpi_init(const char * params, void * user_data) {
+    // TODO check what the parameters are for. Could use it to setup the MPI comms and routes?
     GGML_UNUSED(params);
-    return ggml_backend_mpi_init(intptr_t(user_data));
+    return ggml_backend_mpi_init(ggml_backend_cpu_init());
 }
 
-ggml_backend_buffer_type_t ggml_backend_mpi_buffer_type(int index) {
+ggml_backend_buffer_type_t ggml_backend_mpi_buffer_type() {
     return ggml_backend_cpu_buffer_type();
 }
 
@@ -501,7 +467,7 @@ int ggml_backend_mpi_reg_devices() {
         ggml_backend_register(
                 device.name,
                 ggml_backend_reg_mpi_init,
-                ggml_backend_mpi_buffer_type(device.index),
+                ggml_backend_mpi_buffer_type(),
                 reinterpret_cast<void *>(intptr_t(device.index))
         );
     }
diff --git a/ggml-mpi.h b/ggml-mpi.h
index 2a0c5809c..c72ec0444 100644
--- a/ggml-mpi.h
+++ b/ggml-mpi.h
@@ -53,7 +53,6 @@ struct ggml_mpi_context * ggml_mpi_init(void);
 
 void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * cgraph, int n_layers);
 
-GGML_API ggml_backend_t ggml_backend_mpi_init(int index);
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_type_t buft);
 
 /**
@@ -185,8 +184,7 @@ void ggml_mpi_scatter_layers(
  */
 void ggml_mpi_graph_compute_pre(
         struct ggml_mpi_context * ctx_mpi,
-        struct ggml_cgraph * gf,
-        int n_layers);
+        struct ggml_cgraph * gf);
 
 /**
  * Sends the output tensor to the next node for processing
@@ -198,8 +196,7 @@ void ggml_mpi_graph_compute_pre(
  */
 void ggml_mpi_graph_compute_post(
         struct ggml_mpi_context * ctx_mpi,
-        struct ggml_cgraph * gf,
-        int n_layers);
+        struct ggml_cgraph * gf);
 
 // BACKEND V2
 
@@ -213,6 +210,8 @@ struct ggml_mpi_device {
 #define MPI_BACKEND_NAME "MPI"
 GGML_CALL int ggml_backend_mpi_reg_devices();
 
+GGML_CALL ggml_backend_t ggml_backend_mpi_init(ggml_backend_t wrapped_backend);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/llama.cpp b/llama.cpp
index 444c99e58..edf2a03cf 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4090,15 +4090,15 @@ static bool llm_load_tensors(
     }
 
 #ifdef GGML_USE_MPI
-    for (int64_t i = 0; i < n_layer; i++) {
-        model.buft_layer[i] = {ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft_matrix),
-                               ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft)};
-    }
-
-    model.buft_input = {ggml_backend_mpi_wrap_buffer(model.buft_input.buft_matrix),
-                        ggml_backend_mpi_wrap_buffer(model.buft_input.buft)};
-    model.buft_output = {ggml_backend_mpi_wrap_buffer(model.buft_output.buft_matrix),
-                         ggml_backend_mpi_wrap_buffer(model.buft_output.buft)};
+//    for (int64_t i = 0; i < n_layer; i++) {
+//        model.buft_layer[i] = {ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft_matrix),
+//                               ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft)};
+//    }
+//
+//    model.buft_input = {ggml_backend_mpi_wrap_buffer(model.buft_input.buft_matrix),
+//                        ggml_backend_mpi_wrap_buffer(model.buft_input.buft)};
+//    model.buft_output = {ggml_backend_mpi_wrap_buffer(model.buft_output.buft_matrix),
+//                         ggml_backend_mpi_wrap_buffer(model.buft_output.buft)};
 #endif
 
     // count used buffer types
@@ -8764,10 +8764,7 @@ static void llama_graph_compute(
           llama_context & lctx,
             ggml_cgraph * gf,
                     int   n_threads) {
-#ifdef GGML_USE_MPI
-    const int64_t n_layer = lctx.model.hparams.n_layer;
-    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
-#endif
+
 
 #ifdef GGML_USE_METAL
     if (ggml_backend_is_metal(lctx.backend_metal)) {
@@ -8783,10 +8780,7 @@ static void llama_graph_compute(
 
     ggml_backend_sched_graph_compute_async(lctx.sched, gf);
     // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
-#endif
+
 }
 
 // decode a batch of tokens by evaluating the transformer
@@ -12619,6 +12613,7 @@ static int llama_apply_lora_from_file_internal(
 //
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
+        static_cast<int32_t *>(calloc(1, sizeof(int32_t))),
         /*.n_gpu_layers                =*/ 0,
         /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu                    =*/ 0,
@@ -12998,18 +12993,7 @@ struct llama_context * llama_new_context_with_model(
     }
 #endif
 
-#ifdef GGML_USE_MPI
-    // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
-    ggml_backend_t backend = ggml_backend_mpi_init(model->main_gpu);
-    if (backend == nullptr) {
-        LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
-        llama_free(ctx);
-        return nullptr;
-    }
-    ctx->backends.push_back(backend);
-
-#endif
 
     ctx->backend_cpu = ggml_backend_cpu_init();
     if (ctx->backend_cpu == nullptr) {
         LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
@@ -13018,6 +13002,16 @@ struct llama_context * llama_new_context_with_model(
     }
     ctx->backends.push_back(ctx->backend_cpu);
 
+#ifdef GGML_USE_MPI
+
+    for(auto & backend : ctx->backends) {
+        backend = ggml_backend_mpi_init(backend);
+
+    }
+
+    ctx->backend_cpu = ctx->backends.back();
+#endif
+
     if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, kv_size, cparams.offload_kqv)) {
         LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
         llama_free(ctx);
diff --git a/llama.h b/llama.h
index 818056064..2f2e775ca 100644
--- a/llama.h
+++ b/llama.h
@@ -202,6 +202,9 @@ extern "C" {
     };
 
     struct llama_model_params {
+        // Array of layers to allocate to each node
+        int32_t* n_node_layers;
+
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs