Begin transition to backend v2

parent aa166462f1
commit b98274c76f
6 changed files with 233 additions and 32 deletions
@@ -426,7 +426,7 @@ if (LLAMA_MPI)
         message(STATUS "MPI found")

         set(GGML_HEADERS_MPI ggml-mpi.h)
-        set(GGML_SOURCES_MPI ggml-mpi.c)
+        set(GGML_SOURCES_MPI ggml-mpi.cpp)

         add_compile_definitions(GGML_USE_MPI)
         add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
Makefile (2 changes)
@@ -573,7 +573,7 @@ endif
 endif # LLAMA_METAL

 ifdef LLAMA_MPI
-ggml-mpi.o: ggml-mpi.c ggml-mpi.h
+ggml-mpi.o: ggml-mpi.cpp ggml-mpi.h
     $(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI

@@ -1,11 +1,14 @@
 #include "ggml-mpi.h"

 #include "ggml.h"
+#include "ggml-backend.h"
+#include "ggml-backend-impl.h"

 #include <mpi.h>

 #include <stdio.h>
 #include <stdlib.h>
+#include <vector>

 #define MIN(a, b) ((a) < (b) ? (a) : (b))

@@ -17,6 +20,8 @@ struct ggml_mpi_context {
     MPI_Comm comm;
     int layer_start;
     int layer_end;
+    struct ggml_tensor * inp0;
+    std::string name;
 };

 void ggml_mpi_backend_init(void) {
@@ -29,7 +34,7 @@ void ggml_mpi_backend_free(void) {
 }

 struct ggml_mpi_context * ggml_mpi_init(void) {
-    struct ggml_mpi_context * ctx = calloc(1, sizeof(struct ggml_mpi_context));
+    auto * ctx = static_cast<ggml_mpi_context *>(calloc(1, sizeof(struct ggml_mpi_context)));

     MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank);
     MPI_Comm_size(MPI_COMM_WORLD, &ctx->size);
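The cast is the mechanical consequence of renaming ggml-mpi.c to ggml-mpi.cpp: calloc and realloc return void *, which converts implicitly to any object pointer type in C but not in C++, so every allocation site in this file now needs an explicit cast. A tiny standalone illustration (not part of the commit):

#include <cstdlib>

struct ctx_t { int rank; int size; };

int main() {
    // Accepted in C, ill-formed in C++:  struct ctx_t * c = calloc(1, sizeof(struct ctx_t));
    auto * c = static_cast<ctx_t *>(calloc(1, sizeof(ctx_t)));   // explicit cast required in C++
    c = static_cast<ctx_t *>(realloc(c, 2 * sizeof(ctx_t)));     // same rule applies to realloc
    free(c);
    return 0;
}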
@@ -39,7 +44,7 @@ struct ggml_mpi_context * ggml_mpi_init(void) {
 }

 struct ggml_mpi_context * ggml_mpi_split_comm(struct ggml_mpi_context * ctx, int color, int key) {
-    struct ggml_mpi_context * newCtx = calloc(1, sizeof(struct ggml_mpi_context));
+    auto * newCtx = static_cast<ggml_mpi_context *>(calloc(1, sizeof(struct ggml_mpi_context)));
     MPI_Comm_split(ctx->comm, color, key, &newCtx->comm);
     MPI_Comm_rank(newCtx->comm, &newCtx->rank);
     MPI_Comm_size(newCtx->comm, &newCtx->size);
@@ -70,16 +75,16 @@ void ggml_mpi_eval_init(

     MPI_Barrier(ctx_mpi->comm);
     int32_t old_n_tokens = *n_tokens;
-    MPI_Bcast(n_tokens, 1, MPI_INT, 0, ctx_mpi->comm);
+    MPI_Bcast(n_tokens, 1, MPI_INT32_T, 0, ctx_mpi->comm);

     // If what was passed in differs from what was broadcast,
     // we can't guarantee the allocated sizes are correct
     // TODO check how often this is done and if it's a problem,
     // try to allocate ahead of time
     if (old_n_tokens != *n_tokens) {
-        *pos = realloc(*pos, *n_tokens * sizeof(int32_t));
-        *n_seq_ids = realloc(*n_seq_ids, *n_tokens * sizeof(int32_t));
-        *logits = realloc(*logits, *n_tokens * sizeof(int32_t));
+        *pos = static_cast<int32_t *>(realloc(*pos, *n_tokens * sizeof(int32_t)));
+        *n_seq_ids = static_cast<int32_t *>(realloc(*n_seq_ids, *n_tokens * sizeof(int32_t)));
+        *logits = static_cast<int8_t *>(realloc(*logits, *n_tokens * sizeof(int32_t)));
     }

@@ -96,7 +101,7 @@ void ggml_mpi_eval_init(

     // MPI can't chase the pointers for multidimensional arrays, so we flatten them first
     // for transit
-    int32_t * flattened_seq_ids = calloc(total_n_seq_ids, sizeof(int32_t));
+    auto * flattened_seq_ids = static_cast<int32_t *>(calloc(total_n_seq_ids, sizeof(int32_t)));

     int32_t current_index = 0;

@@ -114,10 +119,10 @@ void ggml_mpi_eval_init(
     MPI_Bcast( *pos, *n_tokens, MPI_INT32_T, 0, ctx_mpi->comm);
     MPI_Bcast(flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, 0, ctx_mpi->comm);
     //MPI_Bcast(*logits, *n_tokens, MPI_INT8_T, 0, ctx_mpi->comm);
-    int32_t ** new_seq_id = calloc(*n_tokens, sizeof(int32_t*));
+    auto ** new_seq_id = static_cast<int32_t **>(calloc(*n_tokens, sizeof(int32_t *)));
     current_index = 0;
     for (int32_t i = 0; i < *n_tokens; i++) {
-        new_seq_id[i] = calloc((*n_seq_ids)[i], sizeof(int32_t));
+        new_seq_id[i] = static_cast<int32_t *>(calloc((*n_seq_ids)[i], sizeof(int32_t)));
         for (int32_t j = 0; j < (*n_seq_ids)[i]; j++) {
             new_seq_id[i][j] = flattened_seq_ids[current_index];
             current_index++;
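The two hunks above implement the "flatten for transit" idea from the comment: a jagged int32_t ** cannot be handed to MPI_Bcast directly, so rank 0 flattens it into one contiguous buffer, every rank receives that buffer, and each rank rebuilds the per-token arrays. A self-contained sketch of the same flatten / broadcast / rebuild pattern, with hypothetical names and std::vector in place of calloc (illustration only, not part of the commit):

#include <mpi.h>
#include <cstdint>
#include <cstdio>
#include <vector>

// Rank 0 owns `rows`; afterwards every rank holds an identical copy.
static std::vector<std::vector<int32_t>> bcast_jagged(std::vector<std::vector<int32_t>> rows, MPI_Comm comm) {
    int rank;
    MPI_Comm_rank(comm, &rank);

    // 1. broadcast the row count and the per-row lengths
    int32_t n_rows = (int32_t) rows.size();
    MPI_Bcast(&n_rows, 1, MPI_INT32_T, 0, comm);
    std::vector<int32_t> lens(n_rows, 0);
    if (rank == 0) {
        for (int32_t i = 0; i < n_rows; i++) lens[i] = (int32_t) rows[i].size();
    }
    MPI_Bcast(lens.data(), n_rows, MPI_INT32_T, 0, comm);

    // 2. flatten on rank 0 and broadcast one contiguous buffer
    std::vector<int32_t> flat;
    if (rank == 0) {
        for (const auto & r : rows) flat.insert(flat.end(), r.begin(), r.end());
    }
    int32_t total = 0;
    for (int32_t l : lens) total += l;
    flat.resize(total);
    MPI_Bcast(flat.data(), total, MPI_INT32_T, 0, comm);

    // 3. rebuild the jagged structure on every rank
    std::vector<std::vector<int32_t>> out(n_rows);
    int32_t idx = 0;
    for (int32_t i = 0; i < n_rows; i++) {
        out[i].assign(flat.begin() + idx, flat.begin() + idx + lens[i]);
        idx += lens[i];
    }
    return out;
}

int main(int argc, char ** argv) {
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    std::vector<std::vector<int32_t>> seq_ids;
    if (rank == 0) seq_ids = {{0}, {0, 1}, {2}};   // jagged "seq_id" data on the root

    auto replicated = bcast_jagged(seq_ids, MPI_COMM_WORLD);
    std::printf("rank %d: row 1 has %zu ids\n", rank, replicated[1].size());

    MPI_Finalize();
    return 0;
}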
@@ -176,7 +181,7 @@ static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src, MPI_C
     }

     MPI_Status status; UNUSED(status);

     fprintf(stderr, "%s: tensor receive == null: %d\n", __func__, t->data == NULL);
     const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, comm, &status);
     GGML_ASSERT(retval == MPI_SUCCESS);
 }
@@ -241,11 +246,7 @@ void ggml_mpi_scatter_layers(
     fprintf(stderr, "Ranges for rank %d: [%d, %d]\n", ctx_mpi->rank, ctx_mpi->layer_start, ctx_mpi->layer_end);
 }

-// TODO: there are many improvements that can be done to this implementation
-void ggml_mpi_graph_compute_pre(
-        struct ggml_mpi_context * ctx_mpi,
-        struct ggml_cgraph * gf,
-        int n_layers) {
+void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * gf, int n_layers) {
     const int mpi_rank = ctx_mpi->rank;
     const int mpi_size = ctx_mpi->size;

@@ -261,6 +262,8 @@ void ggml_mpi_graph_compute_pre(
         return;
     }

+    ctx_mpi->inp0 = inp0;
+
     // fprintf(stderr, "gf->nodes[0] == %s\n", ggml_get_name(gf->nodes[0]));
     //
     // GGML_ASSERT(inp0 == gf->nodes[0]);
@@ -278,23 +281,11 @@ void ggml_mpi_graph_compute_pre(
     //


-    if (mpi_rank > 0) {
-        if (mpi_rank == 1) {
-            // the first node (1) receives the input tokens from the main node (0)
-            ggml_mpi_tensor_recv(inp_tokens, 0, ctx_mpi->comm);
-        } else {
-            // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph)
-            ggml_mpi_tensor_recv(inp0, mpi_rank - 1, ctx_mpi->comm);
-        }
-    } else if (mpi_size > 1) {
-        // node 0 sends the input tokens to node 1
-        ggml_mpi_tensor_send(inp_tokens, 1, ctx_mpi->comm);
-
-        // recv the output data from the last node
-        ggml_mpi_tensor_recv(inp0, mpi_size - 1, ctx_mpi->comm);
+    for (int i = 0; i < gf->n_nodes; i++) {
+        gf->nodes[i]->backend = GGML_BACKEND_MPI_SPLIT;
     }

     {

@@ -347,6 +338,47 @@ void ggml_mpi_graph_compute_pre(
     }
 }

+// TODO: there are many improvements that can be done to this implementation
+void ggml_mpi_graph_compute_pre(
+        struct ggml_mpi_context * ctx_mpi,
+        struct ggml_cgraph * gf,
+        int n_layers) {
+    const int mpi_rank = ctx_mpi->rank;
+    const int mpi_size = ctx_mpi->size;
+
+    struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
+    if (inp_tokens == NULL) {
+        fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__);
+        return;
+    }
+
+    struct ggml_tensor * inp0 = ctx_mpi->inp0;
+    if (inp0 == NULL) {
+        fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__);
+        return;
+    }
+
+    if (mpi_rank > 0) {
+        if (mpi_rank == 1) {
+            // the first node (1) receives the input tokens from the main node (0)
+            if (inp_tokens->data == NULL) {
+
+            }
+            ggml_mpi_tensor_recv(inp_tokens, 0, ctx_mpi->comm);
+        } else {
+            // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph)
+            fprintf(stderr, "%s:%d: receiving layer inp0\n", __func__, ctx_mpi->rank);
+            ggml_mpi_tensor_recv(inp0, mpi_rank - 1, ctx_mpi->comm);
+        }
+    } else if (mpi_size > 1) {
+        // node 0 sends the input tokens to node 1
+        ggml_mpi_tensor_send(inp_tokens, 1, ctx_mpi->comm);
+
+        // recv the output data from the last node
+        ggml_mpi_tensor_recv(inp0, mpi_size - 1, ctx_mpi->comm);
+    }
+}
+
 void ggml_mpi_graph_compute_post(
         struct ggml_mpi_context * ctx_mpi,
         struct ggml_cgraph * gf,

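The relocated ggml_mpi_graph_compute_pre, together with the send visible in ggml_mpi_graph_compute_post in the next hunk, keeps the pipeline shape: rank 0 sends the token tensor to rank 1, every other rank receives its input activations from rank - 1, and each worker forwards its last graph node to (rank + 1) % size, so the final activations arrive back at rank 0. A standalone ring sketch of that traffic pattern with a dummy buffer (plain MPI, hypothetical names; rank 0 here only injects input and collects the result; not part of the commit):

#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char ** argv) {
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    std::vector<float> act(8, 0.0f);   // stand-in for a tensor's data

    if (rank == 0) {
        act.assign(8, 1.0f);           // "input tokens"
        if (size > 1) {
            MPI_Send(act.data(), 8, MPI_FLOAT, 1, 0, MPI_COMM_WORLD);
            MPI_Recv(act.data(), 8, MPI_FLOAT, size - 1, MPI_ANY_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }
        std::printf("rank 0 got the final activations back, act[0] = %f\n", act[0]);
    } else {
        MPI_Recv(act.data(), 8, MPI_FLOAT, rank - 1, MPI_ANY_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        for (float & x : act) x += 1.0f;                              // stand-in for this rank's layers
        MPI_Send(act.data(), 8, MPI_FLOAT, (rank + 1) % size, 0, MPI_COMM_WORLD);
    }

    MPI_Finalize();
    return 0;
}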
@@ -361,3 +393,121 @@ void ggml_mpi_graph_compute_post(
         ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size, ctx_mpi->comm);
     }
 }
+
+// BACKEND V2
+
+static const char * ggml_backend_mpi_name(ggml_backend_t backend) {
+    auto * ctx = static_cast<ggml_mpi_context *>(backend->context);
+    return ctx->name.c_str();
+}
+
+static void ggml_backend_mpi_free(ggml_backend_t backend) {
+    auto * ctx = static_cast<ggml_mpi_context *>(backend->context);
+
+    delete ctx;
+    delete backend;
+}
+
+static ggml_backend_buffer_type_t ggml_backend_mpi_get_default_buffer_type(ggml_backend_t backend) {
+    return ggml_backend_cpu_buffer_type();
+}
+
+GGML_CALL static bool ggml_backend_mpi_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_CPY:
+            return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float
+        case GGML_OP_MUL_MAT:
+            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
+        default:
+            return true;
+    }
+
+    GGML_UNUSED(backend);
+}
+
+static struct ggml_backend_i mpi_backend_i = {
+    /* .get_name                = */ ggml_backend_mpi_name,
+    /* .free                    = */ ggml_backend_mpi_free,
+    /* .get_default_buffer_type = */ ggml_backend_mpi_get_default_buffer_type,
+    /* .set_tensor_async        = */ NULL,
+    /* .get_tensor_async        = */ NULL,
+    /* .cpy_tensor_async        = */ NULL,
+    /* .synchronize             = */ NULL,
+    /* .graph_plan_create       = */ NULL,
+    /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_compute      = */ NULL,
+    /* .graph_compute           = */ ggml_backend_graph_compute,
+    /* .supports_op             = */ ggml_backend_mpi_supports_op,
+};
+
+std::vector<ggml_mpi_device> ggml_mpi_available_devices_internal() {
+    static bool has_init = false;
+    if (!has_init) {
+        ggml_mpi_backend_init();
+        has_init = true;
+    }
+    std::vector<ggml_mpi_device> devices;
+    int s;
+    MPI_Comm_size(MPI_COMM_WORLD, &s);
+    devices.resize(s);
+    for (int i = 0; i < s; i++) {
+        devices[i] = ggml_mpi_device{
+            i,
+            ggml_mpi_init(),
+            ("MPI_COMM_WORLD:" + std::to_string(i)).c_str(),
+            1
+        };
+    }
+    return devices;
+}
+
+ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_type_t buft) {
+    auto * ggml_backend_wrapped_buffer_type = new ggml_backend_buffer_type{
+        /* .iface   = */ buft->iface,
+        /* .context = */ buft->context,
+    };
+
+    return ggml_backend_wrapped_buffer_type;
+}
+
+ggml_backend_t ggml_backend_mpi_init(int index) {
+    auto * mpi_backend = new ggml_backend {
+        /* .interface = */ mpi_backend_i,
+        /* .context   = */ ggml_mpi_init(),
+    };
+
+    return mpi_backend;
+}
+
+static ggml_backend_t ggml_backend_reg_mpi_init(const char * params, void * user_data) {
+    GGML_UNUSED(params);
+    return ggml_backend_mpi_init(intptr_t(user_data));
+}
+
+ggml_backend_buffer_type_t ggml_backend_mpi_buffer_type(int index) {
+    return ggml_backend_cpu_buffer_type();
+}
+
+extern "C" GGML_CALL int ggml_backend_mpi_reg_devices();
+
+int ggml_backend_mpi_reg_devices() {
+    auto devices = ggml_mpi_available_devices_internal();
+    for (const auto & device : devices) {
+        ggml_backend_register(
+            device.name,
+            ggml_backend_reg_mpi_init,
+            ggml_backend_mpi_buffer_type(device.index),
+            reinterpret_cast<void *>(intptr_t(device.index))
+        );
+    }
+    return devices.size();
+}
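With the registration above, each MPI rank is exposed as a named ggml backend. A short sketch of how the new entry points are reached through the public ggml-backend API (illustration only; it assumes the ggml-backend functions of this period such as ggml_backend_name and ggml_backend_free, and is not part of the commit):

#include <cstdio>
#include "ggml-backend.h"
#include "ggml-mpi.h"

int main() {
    ggml_backend_mpi_reg_devices();                      // one registry entry per MPI rank (also initializes MPI)

    ggml_backend_t backend = ggml_backend_mpi_init(0);   // wraps a fresh ggml_mpi_context in mpi_backend_i

    // ggml_backend_name() dispatches to ggml_backend_mpi_name(),
    // ggml_backend_get_default_buffer_type() to ggml_backend_mpi_get_default_buffer_type().
    std::printf("backend: %s\n", ggml_backend_name(backend));
    ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend); // the CPU buffer type for now
    (void) buft;

    ggml_backend_free(backend);                          // dispatches to ggml_backend_mpi_free()
    ggml_mpi_backend_free();                             // counterpart of ggml_mpi_backend_init()
    return 0;
}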
ggml-mpi.h (19 changes)
@@ -1,6 +1,8 @@
 #pragma once
 #include <stdint.h>
 #include <stddef.h>
+#include "ggml.h"
+#include "ggml-backend.h"

 struct ggml_context;
 struct ggml_tensor;
@@ -49,6 +51,11 @@ void ggml_mpi_backend_free(void);
 */
 struct ggml_mpi_context * ggml_mpi_init(void);

+void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * cgraph, int n_layers);
+
+GGML_API ggml_backend_t ggml_backend_mpi_init(int index);
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_type_t buft);
+
 /**
  * Create a new context by splitting the given context's
  * communicator, creating a "sub-communicator." This is a collective
@@ -194,6 +201,18 @@ void ggml_mpi_graph_compute_post(
         struct ggml_cgraph * gf,
         int n_layers);

+// BACKEND V2
+
+struct ggml_mpi_device {
+    int index;
+    struct ggml_mpi_context * ctx_mpi;
+    const char * name;
+    int subgroupSize;
+};
+
+#define MPI_BACKEND_NAME "MPI"
+GGML_CALL int ggml_backend_mpi_reg_devices();
+
 #ifdef __cplusplus
 }
 #endif
ggml.h (1 change)
@@ -379,6 +379,7 @@ extern "C" {
         GGML_BACKEND_TYPE_CPU = 0,
         GGML_BACKEND_TYPE_GPU = 10,
         GGML_BACKEND_TYPE_GPU_SPLIT = 20,
+        GGML_BACKEND_TYPE_MPI_SPLIT = 30,
     };

     // model file types
llama.cpp (31 changes)
@@ -4089,6 +4089,18 @@ static bool llm_load_tensors(
             }
         }

+#ifdef GGML_USE_MPI
+        for (int64_t i = 0; i < n_layer; i++) {
+            model.buft_layer[i] = {ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft_matrix),
+                                   ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft)};
+        }
+
+        model.buft_input  = {ggml_backend_mpi_wrap_buffer(model.buft_input.buft_matrix),
+                             ggml_backend_mpi_wrap_buffer(model.buft_input.buft)};
+        model.buft_output = {ggml_backend_mpi_wrap_buffer(model.buft_output.buft_matrix),
+                             ggml_backend_mpi_wrap_buffer(model.buft_output.buft)};
+#endif
+
         // count used buffer types
         std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
         buft_layer_count[model.buft_input.buft]++;
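Because ggml_backend_mpi_wrap_buffer copies the wrapped type's iface and context verbatim, the wrapped buffer types used above allocate exactly as the originals would for now; the wrapper presumably exists so the MPI layer can interpose on allocation later. A minimal illustration (assumes the public ggml-backend buffer API; not part of the commit):

#include "ggml-backend.h"
#include "ggml-mpi.h"

int main() {
    // The wrapped type carries the CPU type's iface and context, so this buffer
    // is allocated exactly as a plain CPU buffer would be.
    ggml_backend_buffer_type_t wrapped = ggml_backend_mpi_wrap_buffer(ggml_backend_cpu_buffer_type());
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(wrapped, 4096);
    ggml_backend_buffer_free(buf);
    return 0;
}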
@@ -4965,6 +4977,12 @@ static bool llm_load_tensors(
                 mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
             }
         }
+
+#ifdef GGML_USE_MPI
+        if (buf == nullptr) {
+            continue;
+        }
+#endif
         if (buf == nullptr) {
             throw std::runtime_error("failed to allocate buffer");
         }
@@ -12978,6 +12996,19 @@ struct llama_context * llama_new_context_with_model(
             }
             ctx->backends.push_back(backend);
         }
 #endif
+
+#ifdef GGML_USE_MPI
+        // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
+        ggml_backend_t backend = ggml_backend_mpi_init(model->main_gpu);
+        if (backend == nullptr) {
+            LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
+            llama_free(ctx);
+            return nullptr;
+        }
+        ctx->backends.push_back(backend);
+
+
+#endif
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {