Wrap backends with MPI backend

Branden Butler 2024-02-19 12:21:48 -06:00
parent b98274c76f
commit 968cefb4a9
4 changed files with 74 additions and 112 deletions
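
The change replaces the index-based MPI backend initialization with a wrapper: ggml_backend_mpi_init() now takes an already-initialized backend, stores it in the MPI context, and brackets that backend's graph computation with the MPI pre/post communication steps (ggml_mpi_graph_compute_pre/post). A minimal sketch of this wrapping pattern, using simplified stand-in types rather than the real ggml_backend interfaces (the types, member names, and empty pre/post bodies below are illustrative assumptions, not the commit's code):

    #include <memory>
    #include <utility>

    // Simplified stand-ins for the ggml types; the real structs live in ggml-backend.h.
    struct graph {};        // placeholder for ggml_cgraph

    struct backend {        // placeholder for ggml_backend
        virtual ~backend() = default;
        virtual const char * name() const = 0;
        virtual bool compute(graph & g) = 0;
    };

    // The MPI backend owns a wrapped backend and brackets its compute call
    // with the pre/post communication steps.
    struct mpi_backend : backend {
        explicit mpi_backend(std::unique_ptr<backend> wrapped)
            : wrapped_backend(std::move(wrapped)) {}

        const char * name() const override {
            // The commit forwards get_name to the wrapped backend.
            return wrapped_backend->name();
        }

        bool compute(graph & g) override {
            graph_compute_pre(g);                   // receive inputs from the previous rank
            bool ok = wrapped_backend->compute(g);  // delegate the actual work
            graph_compute_post(g);                  // send outputs to the next rank
            return ok;
        }

    private:
        void graph_compute_pre(graph &) {}   // MPI communication elided in this sketch
        void graph_compute_post(graph &) {}

        std::unique_ptr<backend> wrapped_backend;
    };

The same shape appears below in ggml_backend_mpi_graph_compute, which runs the pre step, forwards the graph to the wrapped backend's graph_compute, then runs the post step.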

View file

@@ -22,6 +22,7 @@ struct ggml_mpi_context {
int layer_end;
struct ggml_tensor *inp0;
std::string name;
struct ggml_backend * wrapped_backend;
};
void ggml_mpi_backend_init(void) {
@@ -247,8 +248,6 @@ void ggml_mpi_scatter_layers(
}
void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * gf, int n_layers) {
const int mpi_rank = ctx_mpi->rank;
const int mpi_size = ctx_mpi->size;
struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
if (inp_tokens == NULL) {
@@ -286,73 +285,22 @@ void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml
}
{
//const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size;
const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1;
//const int il0 = (mpi_idx + 0) * n_per_node;
//const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node);
int il0 = ctx_mpi->layer_start;
int il1 = MIN(n_layers, ctx_mpi->layer_end);
char name_l0[GGML_MAX_NAME];
char name_l1[GGML_MAX_NAME];
snprintf(name_l0, sizeof(name_l0), "layer_inp_%d", il0);
snprintf(name_l1, sizeof(name_l1), "layer_inp_%d", il1);
const int idx_l0 = ggml_graph_get_node_idx(gf, name_l0);
const int idx_l1 = mpi_rank > 0 ? ggml_graph_get_node_idx(gf, name_l1) + 1 : gf->n_nodes;
if (idx_l0 < 0 || idx_l1 < 0) {
fprintf(stderr, "%s: layer input nodes not found\n", __func__);
return;
}
// attach the input data to all nodes that need it
// TODO: not great - should be able to do this without modifying the compute graph (see next TODO below)
for (int i = idx_l0; i < idx_l1; i++) {
if (gf->nodes[i]->src[0] == gf->nodes[idx_l0]) {
gf->nodes[i]->src[0] = inp0;
}
if (gf->nodes[i]->src[1] == gf->nodes[idx_l0]) {
gf->nodes[i]->src[1] = inp0;
}
}
// TODO: instead of rearranging the nodes, we should be able to execute a subset of the compute graph
for (int i = 1; i < idx_l1 - idx_l0; i++) {
gf->nodes[i] = gf->nodes[idx_l0 + i];
}
// the first node performs the "get_rows" operation, the rest of the nodes get the data from the previous node
if (mpi_idx != 0) {
gf->nodes[0]->op = GGML_OP_NONE;
}
gf->n_nodes = idx_l1 - idx_l0;
}
}
// TODO: there are many improvements that can be done to this implementation
void ggml_mpi_graph_compute_pre(
struct ggml_mpi_context * ctx_mpi,
struct ggml_cgraph * gf,
int n_layers) {
struct ggml_cgraph * gf) {
const int mpi_rank = ctx_mpi->rank;
const int mpi_size = ctx_mpi->size;
struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
struct ggml_tensor * inp_tokens = gf->nodes[0];
if (inp_tokens == NULL) {
fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__);
return;
}
struct ggml_tensor * inp0 = ctx_mpi->inp0;
struct ggml_tensor * inp0 = ggml_graph_get_tensor(gf, "layer_inp_0");
if (inp0 == NULL) {
fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__);
return;
@@ -381,9 +329,7 @@ void ggml_mpi_graph_compute_pre(
void ggml_mpi_graph_compute_post(
struct ggml_mpi_context * ctx_mpi,
struct ggml_cgraph * gf,
int n_layers) {
UNUSED(n_layers);
struct ggml_cgraph * gf) {
const int mpi_rank = ctx_mpi->rank;
const int mpi_size = ctx_mpi->size;
@@ -396,9 +342,24 @@ void ggml_mpi_graph_compute_post(
// BACKEND V2
GGML_CALL static bool ggml_backend_mpi_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
struct ggml_mpi_context * ctx = (ggml_mpi_context *) backend->context;
ggml_mpi_graph_compute_pre(ctx, cgraph);
ggml_backend_t wrapped_backend = ctx->wrapped_backend;
bool ret = ggml_backend_graph_compute(wrapped_backend, cgraph);
ggml_mpi_graph_compute_post(ctx, cgraph);
return ret;
}
static const char * ggml_backend_mpi_name(ggml_backend_t backend) {
auto * ctx = static_cast<ggml_mpi_context *>(backend->context);
return ctx->name.c_str();
return ctx->wrapped_backend->iface.get_name(backend);
}
static void ggml_backend_mpi_free(ggml_backend_t backend) {
@@ -427,20 +388,6 @@ GGML_CALL static bool ggml_backend_mpi_supports_op(ggml_backend_t backend, const
GGML_UNUSED(backend);
}
static struct ggml_backend_i mpi_backend_i = {
/* .get_name = */ ggml_backend_mpi_name,
/* .free = */ ggml_backend_mpi_free,
/* .get_default_buffer_type = */ ggml_backend_mpi_get_default_buffer_type,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_graph_compute,
/* .supports_op = */ ggml_backend_mpi_supports_op,
};
std::vector<ggml_mpi_device> ggml_mpi_available_devices_internal() {
@@ -473,23 +420,42 @@ ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_type
return ggml_backend_wrapped_buffer_type;
}
ggml_backend_t ggml_backend_mpi_init(int index) {
ggml_backend_t ggml_backend_mpi_init(ggml_backend_t wrapped_backend) {
struct ggml_backend_i mpi_backend_i = {
/* .get_name = */ wrapped_backend->iface.get_name,
/* .free = */ ggml_backend_mpi_free,
/* .get_default_buffer_type = */ ggml_backend_mpi_get_default_buffer_type,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_mpi_graph_compute,
/* .supports_op = */ ggml_backend_mpi_supports_op,
};
ggml_mpi_context * ctx = ggml_mpi_init();
ctx->wrapped_backend = wrapped_backend;
auto *mpi_backend = new ggml_backend {
/* .interface = */ mpi_backend_i,
/* .context = */ ggml_mpi_init(),
/* .context = */ ctx,
};
return mpi_backend;
}
static ggml_backend_t ggml_backend_reg_mpi_init(const char * params, void * user_data) {
// TODO check what the parameters are for. Could use it to setup the MPI comms and routes?
GGML_UNUSED(params);
return ggml_backend_mpi_init(intptr_t(user_data));
return ggml_backend_mpi_init(ggml_backend_cpu_init());
}
ggml_backend_buffer_type_t ggml_backend_mpi_buffer_type(int index) {
ggml_backend_buffer_type_t ggml_backend_mpi_buffer_type() {
return ggml_backend_cpu_buffer_type();
}
@@ -501,7 +467,7 @@ int ggml_backend_mpi_reg_devices() {
ggml_backend_register(
device.name,
ggml_backend_reg_mpi_init,
ggml_backend_mpi_buffer_type(device.index),
ggml_backend_mpi_buffer_type(),
reinterpret_cast<void *>(intptr_t(device.index))
);
}
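
For reference, the new entry point is meant to be handed an already-initialized backend. A minimal usage sketch based only on functions appearing in this diff (graph construction and error handling omitted; whether ggml_backend_mpi_free also frees the wrapped backend is not shown in these hunks):

    ggml_backend_t cpu_backend = ggml_backend_cpu_init();
    ggml_backend_t mpi_backend = ggml_backend_mpi_init(cpu_backend); // MPI backend wraps the CPU backend
    // ... build a ggml_cgraph * gf for the model ...
    ggml_backend_graph_compute(mpi_backend, gf); // runs the MPI pre step, the wrapped compute, then the post step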

View file

@@ -53,7 +53,6 @@ struct ggml_mpi_context * ggml_mpi_init(void);
void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * cgraph, int n_layers);
GGML_API ggml_backend_t ggml_backend_mpi_init(int index);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_type_t buft);
/**
@@ -185,8 +184,7 @@ void ggml_mpi_scatter_layers(
*/
void ggml_mpi_graph_compute_pre(
struct ggml_mpi_context * ctx_mpi,
struct ggml_cgraph * gf,
int n_layers);
struct ggml_cgraph * gf);
/**
* Sends the output tensor to the next node for processing
@@ -198,8 +196,7 @@ void ggml_mpi_graph_compute_pre(
*/
void ggml_mpi_graph_compute_post(
struct ggml_mpi_context * ctx_mpi,
struct ggml_cgraph * gf,
int n_layers);
struct ggml_cgraph * gf);
// BACKEND V2
@@ -213,6 +210,8 @@ struct ggml_mpi_device {
#define MPI_BACKEND_NAME "MPI"
GGML_CALL int ggml_backend_mpi_reg_devices();
GGML_CALL ggml_backend_t ggml_backend_mpi_init(ggml_backend_t wrapped_backend);
#ifdef __cplusplus
}
#endif

View file

@@ -4090,15 +4090,15 @@ static bool llm_load_tensors(
}
#ifdef GGML_USE_MPI
for (int64_t i = 0; i < n_layer; i++) {
model.buft_layer[i] = {ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft_matrix),
ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft)};
}
model.buft_input = {ggml_backend_mpi_wrap_buffer(model.buft_input.buft_matrix),
ggml_backend_mpi_wrap_buffer(model.buft_input.buft)};
model.buft_output = {ggml_backend_mpi_wrap_buffer(model.buft_output.buft_matrix),
ggml_backend_mpi_wrap_buffer(model.buft_output.buft)};
// for (int64_t i = 0; i < n_layer; i++) {
// model.buft_layer[i] = {ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft_matrix),
// ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft)};
// }
//
// model.buft_input = {ggml_backend_mpi_wrap_buffer(model.buft_input.buft_matrix),
// ggml_backend_mpi_wrap_buffer(model.buft_input.buft)};
// model.buft_output = {ggml_backend_mpi_wrap_buffer(model.buft_output.buft_matrix),
// ggml_backend_mpi_wrap_buffer(model.buft_output.buft)};
#endif
// count used buffer types
@@ -8764,10 +8764,7 @@ static void llama_graph_compute(
llama_context & lctx,
ggml_cgraph * gf,
int n_threads) {
#ifdef GGML_USE_MPI
const int64_t n_layer = lctx.model.hparams.n_layer;
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
#endif
#ifdef GGML_USE_METAL
if (ggml_backend_is_metal(lctx.backend_metal)) {
@@ -8784,9 +8781,6 @@ static void llama_graph_compute(
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
#ifdef GGML_USE_MPI
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
#endif
}
// decode a batch of tokens by evaluating the transformer
@@ -12619,6 +12613,7 @@ static int llama_apply_lora_from_file_internal(
//
struct llama_model_params llama_model_default_params() {
struct llama_model_params result = {
/*.n_node_layers =*/ static_cast<int32_t *>(calloc(1, sizeof(int32_t))),
/*.n_gpu_layers =*/ 0,
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
/*.main_gpu =*/ 0,
@@ -12998,18 +12993,7 @@ struct llama_context * llama_new_context_with_model(
}
#endif
#ifdef GGML_USE_MPI
// with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
ggml_backend_t backend = ggml_backend_mpi_init(model->main_gpu);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
#endif
ctx->backend_cpu = ggml_backend_cpu_init();
if (ctx->backend_cpu == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
@@ -13018,6 +13002,16 @@ struct llama_context * llama_new_context_with_model(
}
ctx->backends.push_back(ctx->backend_cpu);
#ifdef GGML_USE_MPI
for(auto & backend : ctx->backends) {
backend = ggml_backend_mpi_init(backend);
}
ctx->backend_cpu = ctx->backends.back();
#endif
if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, kv_size, cparams.offload_kqv)) {
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
llama_free(ctx);

View file

@@ -202,6 +202,9 @@ extern "C" {
};
struct llama_model_params {
// Array of layers to allocate to each node
int32_t* n_node_layers;
int32_t n_gpu_layers; // number of layers to store in VRAM
enum llama_split_mode split_mode; // how to split the model across multiple GPUs