From b98274c76fa5c1b534237fd5327aae4a7435b0c5 Mon Sep 17 00:00:00 2001
From: Branden Butler
Date: Mon, 5 Feb 2024 17:19:45 -0600
Subject: [PATCH] Begin transition to backend v2

---
 CMakeLists.txt             |   2 +-
 Makefile                   |   2 +-
 ggml-mpi.c => ggml-mpi.cpp | 210 +++++++++++++++++++++++++++++++------
 ggml-mpi.h                 |  19 ++++
 ggml.h                     |   1 +
 llama.cpp                  |  31 ++++++
 6 files changed, 233 insertions(+), 32 deletions(-)
 rename ggml-mpi.c => ggml-mpi.cpp (67%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3ac2804a6..b805650d3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -426,7 +426,7 @@ if (LLAMA_MPI)
         message(STATUS "MPI found")
 
         set(GGML_HEADERS_MPI ggml-mpi.h)
-        set(GGML_SOURCES_MPI ggml-mpi.c)
+        set(GGML_SOURCES_MPI ggml-mpi.cpp)
 
         add_compile_definitions(GGML_USE_MPI)
         add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
diff --git a/Makefile b/Makefile
index cb597b209..f8ad9f146 100644
--- a/Makefile
+++ b/Makefile
@@ -573,7 +573,7 @@ endif
 endif # LLAMA_METAL
 
 ifdef LLAMA_MPI
-ggml-mpi.o: ggml-mpi.c ggml-mpi.h
-	$(CC) $(CFLAGS) -c $< -o $@
+ggml-mpi.o: ggml-mpi.cpp ggml-mpi.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # LLAMA_MPI
 
diff --git a/ggml-mpi.c b/ggml-mpi.cpp
similarity index 67%
rename from ggml-mpi.c
rename to ggml-mpi.cpp
index c10faa252..b43dd96d1 100644
--- a/ggml-mpi.c
+++ b/ggml-mpi.cpp
@@ -1,11 +1,14 @@
 #include "ggml-mpi.h"
 
 #include "ggml.h"
+#include "ggml-backend.h"
+#include "ggml-backend-impl.h"
 
 #include <mpi.h>
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <string>
 
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 
@@ -17,6 +20,8 @@ struct ggml_mpi_context {
     MPI_Comm comm;
     int layer_start;
     int layer_end;
+    struct ggml_tensor * inp0;
+    std::string name;
 };
 
 void ggml_mpi_backend_init(void) {
@@ -29,7 +34,7 @@ void ggml_mpi_backend_free(void) {
 }
 
 struct ggml_mpi_context * ggml_mpi_init(void) {
-    struct ggml_mpi_context * ctx = calloc(1, sizeof(struct ggml_mpi_context));
+    auto * ctx = static_cast<struct ggml_mpi_context *>(calloc(1, sizeof(struct ggml_mpi_context)));
 
     MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank);
     MPI_Comm_size(MPI_COMM_WORLD, &ctx->size);
@@ -39,7 +44,7 @@ struct ggml_mpi_context * ggml_mpi_init(void) {
 
 struct ggml_mpi_context * ggml_mpi_split_comm(struct ggml_mpi_context * ctx, int color, int key) {
-    struct ggml_mpi_context * newCtx = calloc(1, sizeof(struct ggml_mpi_context));
+    auto * newCtx = static_cast<struct ggml_mpi_context *>(calloc(1, sizeof(struct ggml_mpi_context)));
     MPI_Comm_split(ctx->comm, color, key, &newCtx->comm);
     MPI_Comm_rank(newCtx->comm, &newCtx->rank);
     MPI_Comm_size(newCtx->comm, &newCtx->size);
 
@@ -70,16 +75,16 @@ void ggml_mpi_eval_init(
     MPI_Barrier(ctx_mpi->comm);
 
     int32_t old_n_tokens = *n_tokens;
-    MPI_Bcast(n_tokens, 1, MPI_INT, 0, ctx_mpi->comm);
+    MPI_Bcast(n_tokens, 1, MPI_INT32_T, 0, ctx_mpi->comm);
 
     // If what was passed in differs from what was broadcast,
     // we can't guarantee the allocated sizes are correct
     // TODO check how often this is done and if it's a problem,
     // try to allocate ahead of time
     if (old_n_tokens != *n_tokens) {
-        *pos = realloc(*pos, *n_tokens * sizeof(int32_t));
-        *n_seq_ids = realloc(*n_seq_ids, *n_tokens * sizeof(int32_t ));
-        *logits = realloc(*logits, *n_tokens * sizeof(int32_t));
+        *pos = static_cast<int32_t *>(realloc(*pos, *n_tokens * sizeof(int32_t)));
+        *n_seq_ids = static_cast<int32_t *>(realloc(*n_seq_ids, *n_tokens * sizeof(int32_t)));
+        *logits = static_cast<int8_t *>(realloc(*logits, *n_tokens * sizeof(int32_t)));
     }
 
@@ -96,7 +101,7 @@ void ggml_mpi_eval_init(
 
     // MPI can't chase the pointers for multidimensional arrays, so we flatten them first
     // for transit
-    int32_t * flattened_seq_ids = calloc(total_n_seq_ids, sizeof(int32_t));
+    auto * flattened_seq_ids = static_cast<int32_t *>(calloc(total_n_seq_ids, sizeof(int32_t)));
 
     int32_t current_index = 0;
 
@@ -114,10 +119,10 @@ void ggml_mpi_eval_init(
     MPI_Bcast(             *pos, *n_tokens, MPI_INT32_T, 0, ctx_mpi->comm);
     MPI_Bcast(flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, 0, ctx_mpi->comm);
     //MPI_Bcast(*logits, *n_tokens, MPI_INT8_T, 0, ctx_mpi->comm);
-    int32_t ** new_seq_id = calloc(*n_tokens, sizeof(int32_t*));
+    auto ** new_seq_id = static_cast<int32_t **>(calloc(*n_tokens, sizeof(int32_t *)));
     current_index = 0;
     for (int32_t i = 0; i < *n_tokens; i++) {
-        new_seq_id[i] = calloc((*n_seq_ids)[i], sizeof(int32_t));
+        new_seq_id[i] = static_cast<int32_t *>(calloc((*n_seq_ids)[i], sizeof(int32_t)));
         for (int32_t j = 0; j < (*n_seq_ids)[i]; j++) {
             new_seq_id[i][j] = flattened_seq_ids[current_index];
             current_index++;
@@ -176,7 +181,7 @@ static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src, MPI_C
     }
     MPI_Status status; UNUSED(status);
-
+    fprintf(stderr, "%s: tensor receive == null: %d\n", __func__, t->data == NULL);
     const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, comm, &status);
     GGML_ASSERT(retval == MPI_SUCCESS);
 }
 
@@ -241,11 +246,7 @@ void ggml_mpi_scatter_layers(
     fprintf(stderr, "Ranges for rank %d: [%d, %d]\n", ctx_mpi->rank, ctx_mpi->layer_start, ctx_mpi->layer_end);
 }
 
-// TODO: there are many improvements that can be done to this implementation
-void ggml_mpi_graph_compute_pre(
-        struct ggml_mpi_context * ctx_mpi,
-        struct ggml_cgraph * gf,
-        int n_layers) {
+void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * gf, int n_layers) {
     const int mpi_rank = ctx_mpi->rank;
     const int mpi_size = ctx_mpi->size;
 
@@ -261,6 +262,8 @@
         return;
     }
 
+    ctx_mpi->inp0 = inp0;
+
 //    fprintf(stderr, "gf->nodes[0] == %s\n", ggml_get_name(gf->nodes[0]));
 //
 //    GGML_ASSERT(inp0 == gf->nodes[0]);
@@ -278,23 +281,11 @@
 //
 
-
-    if (mpi_rank > 0) {
-        if (mpi_rank == 1) {
-            // the first node (1) receives the input tokens from the main node (0)
-            ggml_mpi_tensor_recv(inp_tokens, 0, ctx_mpi->comm);
-        } else {
-            // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph)
-            ggml_mpi_tensor_recv(inp0, mpi_rank - 1, ctx_mpi->comm);
-        }
-    } else if (mpi_size > 1) {
-        // node 0 sends the input tokens to node 1
-        ggml_mpi_tensor_send(inp_tokens, 1, ctx_mpi->comm);
-
-        // recv the output data from the last node
-        ggml_mpi_tensor_recv(inp0, mpi_size - 1, ctx_mpi->comm);
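+    // Tag every node in the graph for the MPI split backend; the per-rank
+    // send/recv of inp_tokens/inp0 removed here moves into the new
+    // ggml_mpi_graph_compute_pre() added further down in this file.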
+    for (int i = 0; i < gf->n_nodes; i++) {
+        gf->nodes[i]->backend = GGML_BACKEND_MPI_SPLIT;
     }
+
 
     {
@@ -347,6 +338,47 @@
     }
 }
 
+// TODO: there are many improvements that can be done to this implementation
+void ggml_mpi_graph_compute_pre(
+        struct ggml_mpi_context * ctx_mpi,
+        struct ggml_cgraph * gf,
+        int n_layers) {
+    const int mpi_rank = ctx_mpi->rank;
+    const int mpi_size = ctx_mpi->size;
+
+    struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
+    if (inp_tokens == NULL) {
+        fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__);
+        return;
+    }
+
+    struct ggml_tensor * inp0 = ctx_mpi->inp0;
+    if (inp0 == NULL) {
+        fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__);
+        return;
+    }
+
+    if (mpi_rank > 0) {
+        if (mpi_rank == 1) {
+            // the first node (1) receives the input tokens from the main node (0)
+            if (inp_tokens->data == NULL) {
+
+            }
+            ggml_mpi_tensor_recv(inp_tokens, 0, ctx_mpi->comm);
+        } else {
+            // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph)
+            fprintf(stderr, "%s:%d: receiving layer inp0\n", __func__, ctx_mpi->rank);
+            ggml_mpi_tensor_recv(inp0, mpi_rank - 1, ctx_mpi->comm);
+        }
+    } else if (mpi_size > 1) {
+        // node 0 sends the input tokens to node 1
+        ggml_mpi_tensor_send(inp_tokens, 1, ctx_mpi->comm);
+
+        // recv the output data from the last node
+        ggml_mpi_tensor_recv(inp0, mpi_size - 1, ctx_mpi->comm);
+    }
+}
+
 void ggml_mpi_graph_compute_post(
         struct ggml_mpi_context * ctx_mpi,
         struct ggml_cgraph * gf,
@@ -361,3 +393,121 @@
         ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size, ctx_mpi->comm);
     }
 }
+
+// BACKEND V2
+
+static const char * ggml_backend_mpi_name(ggml_backend_t backend) {
+    auto * ctx = static_cast<ggml_mpi_context *>(backend->context);
+    return ctx->name.c_str();
+}
+
+static void ggml_backend_mpi_free(ggml_backend_t backend) {
+    auto * ctx = static_cast<ggml_mpi_context *>(backend->context);
+
+    delete ctx;
+
+    delete backend;
+}
+
+static ggml_backend_buffer_type_t ggml_backend_mpi_get_default_buffer_type(ggml_backend_t backend) {
+    return ggml_backend_cpu_buffer_type();
+}
+
+GGML_CALL static bool ggml_backend_mpi_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_CPY:
+            return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float
+        case GGML_OP_MUL_MAT:
+            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
+        default:
+            return true;
+    }
+
+    GGML_UNUSED(backend);
+}
+
+static struct ggml_backend_i mpi_backend_i = {
+    /* .get_name                = */ ggml_backend_mpi_name,
+    /* .free                    = */ ggml_backend_mpi_free,
+    /* .get_default_buffer_type = */ ggml_backend_mpi_get_default_buffer_type,
+    /* .set_tensor_async        = */ NULL,
+    /* .get_tensor_async        = */ NULL,
+    /* .cpy_tensor_async        = */ NULL,
+    /* .synchronize             = */ NULL,
+    /* .graph_plan_create       = */ NULL,
+    /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_compute      = */ NULL,
+    /* .graph_compute           = */ ggml_backend_graph_compute,
+    /* .supports_op             = */ ggml_backend_mpi_supports_op,
+};
+
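+// Enumerate one ggml_mpi_device per rank in MPI_COMM_WORLD, each wrapping its own ggml_mpi_context.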
+std::vector<ggml_mpi_device> ggml_mpi_available_devices_internal() {
+    static bool has_init = false;
+    if (!has_init) {
+        ggml_mpi_backend_init();
+        has_init = true;
+    }
+    std::vector<ggml_mpi_device> devices;
+    int s;
+    MPI_Comm_size(MPI_COMM_WORLD, &s);
+    devices.resize(s);
+    for (int i = 0; i < s; i++) {
+        devices[i] = ggml_mpi_device{
+            i,
+            ggml_mpi_init(),
+            ("MPI_COMM_WORLD:" + std::to_string(i)).c_str(),
+            1
+        };
+    }
+    return devices;
+}
+
+ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_type_t buft) {
+    auto * ggml_backend_wrapped_buffer_type = new ggml_backend_buffer_type{
+            /* .iface   = */ buft->iface,
+            /* .context = */ buft->context,
+    };
+
+    return ggml_backend_wrapped_buffer_type;
+}
+
+ggml_backend_t ggml_backend_mpi_init(int index) {
+    auto * mpi_backend = new ggml_backend {
+        /* .interface = */ mpi_backend_i,
+        /* .context   = */ ggml_mpi_init(),
+    };
+
+    return mpi_backend;
+}
+
+static ggml_backend_t ggml_backend_reg_mpi_init(const char * params, void * user_data) {
+    GGML_UNUSED(params);
+    return ggml_backend_mpi_init(intptr_t(user_data));
+}
+
+
+ggml_backend_buffer_type_t ggml_backend_mpi_buffer_type(int index) {
+    return ggml_backend_cpu_buffer_type();
+}
+
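+// Registration entry point: registers one MPI backend per detected rank,
+// using the CPU buffer type as its default buffer type for now.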
+extern "C" GGML_CALL int ggml_backend_mpi_reg_devices();
+
+int ggml_backend_mpi_reg_devices() {
+    auto devices = ggml_mpi_available_devices_internal();
+    for (const auto & device : devices) {
+        ggml_backend_register(
+            device.name,
+            ggml_backend_reg_mpi_init,
+            ggml_backend_mpi_buffer_type(device.index),
+            reinterpret_cast<void *>(intptr_t(device.index))
+        );
+    }
+    return devices.size();
+}
+
+
+
diff --git a/ggml-mpi.h b/ggml-mpi.h
index 62b15faef..2a0c5809c 100644
--- a/ggml-mpi.h
+++ b/ggml-mpi.h
@@ -1,6 +1,8 @@
 #pragma once
 #include <stddef.h>
 #include <stdint.h>
+#include "ggml.h"
+#include "ggml-backend.h"
 
 struct ggml_context;
 struct ggml_tensor;
@@ -49,6 +51,11 @@ void ggml_mpi_backend_free(void);
  */
 struct ggml_mpi_context * ggml_mpi_init(void);
 
+void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * cgraph, int n_layers);
+
+GGML_API ggml_backend_t ggml_backend_mpi_init(int index);
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_type_t buft);
+
 /**
  * Create a new context by splitting the given context's
  * communicator, creating a "sub-communicator." This is a collective
@@ -194,6 +201,18 @@ void ggml_mpi_graph_compute_post(
         struct ggml_cgraph * gf,
         int n_layers);
 
+// BACKEND V2
+
+struct ggml_mpi_device {
+    int index;
+    struct ggml_mpi_context * ctx_mpi;
+    const char * name;
+    int subgroupSize;
+};
+
+#define MPI_BACKEND_NAME "MPI"
+GGML_CALL int ggml_backend_mpi_reg_devices();
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml.h b/ggml.h
index ab26c8f59..a4efe792d 100644
--- a/ggml.h
+++ b/ggml.h
@@ -379,6 +379,7 @@ extern "C" {
         GGML_BACKEND_TYPE_CPU = 0,
         GGML_BACKEND_TYPE_GPU = 10,
         GGML_BACKEND_TYPE_GPU_SPLIT = 20,
+        GGML_BACKEND_TYPE_MPI_SPLIT = 30,
     };
 
     // model file types
diff --git a/llama.cpp b/llama.cpp
index 2bdb38434..444c99e58 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4089,6 +4089,18 @@ static bool llm_load_tensors(
         }
     }
 
+#ifdef GGML_USE_MPI
+    for (int64_t i = 0; i < n_layer; i++) {
+        model.buft_layer[i] = {ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft_matrix),
+                               ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft)};
+    }
+
+    model.buft_input  = {ggml_backend_mpi_wrap_buffer(model.buft_input.buft_matrix),
+                         ggml_backend_mpi_wrap_buffer(model.buft_input.buft)};
+    model.buft_output = {ggml_backend_mpi_wrap_buffer(model.buft_output.buft_matrix),
+                         ggml_backend_mpi_wrap_buffer(model.buft_output.buft)};
+#endif
+
     // count used buffer types
     std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
     buft_layer_count[model.buft_input.buft]++;
@@ -4965,6 +4977,12 @@ static bool llm_load_tensors(
                 mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
             }
         }
+
+#ifdef GGML_USE_MPI
+        if (buf == nullptr) {
+            continue;
+        }
+#endif
         if (buf == nullptr) {
             throw std::runtime_error("failed to allocate buffer");
         }
@@ -12978,6 +12996,19 @@ struct llama_context * llama_new_context_with_model(
             }
             ctx->backends.push_back(backend);
         }
+#endif
+
+#ifdef GGML_USE_MPI
+        // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
+        ggml_backend_t backend = ggml_backend_mpi_init(model->main_gpu);
+        if (backend == nullptr) {
+            LLAMA_LOG_ERROR("%s: failed to initialize MPI backend %d\n", __func__, model->main_gpu);
+            llama_free(ctx);
+            return nullptr;
+        }
+        ctx->backends.push_back(backend);
+
+
 #endif
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {