From 4692644ff9262c0db9d941c691194ec0e05bcce0 Mon Sep 17 00:00:00 2001
From: Branden Butler
Date: Wed, 13 Mar 2024 01:38:38 -0500
Subject: [PATCH] Remove hard-coded layer splits and support more than 2 nodes

---
 common/common.cpp |  4 +++
 ggml-mpi.cpp      | 66 ++++++++++++++++++++++++-----------------
 ggml-mpi.h        |  2 +-
 ggml.h            |  2 +-
 llama.cpp         | 59 ++++++++++++++++++++++++------------------
 llama.h           |  2 +-
 6 files changed, 75 insertions(+), 60 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 46ec366b0..4a8e93cc5 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1319,6 +1319,10 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
         mparams.kv_overrides = params.kv_overrides.data();
     }
 
+    free((void *) mparams.node_layer_weights);
+
+    mparams.node_layer_weights = params.mpi_layer_split.data();
+
     return mparams;
 }
 
diff --git a/ggml-mpi.cpp b/ggml-mpi.cpp
index 30e74c8bc..95dcb0fd3 100644
--- a/ggml-mpi.cpp
+++ b/ggml-mpi.cpp
@@ -285,34 +285,27 @@ uint16_t** ggml_mpi_split_range(
     struct ggml_mpi_context * ctx_mpi,
     uint16_t start,
     uint16_t end,
-    float node_weights[]
+    const float node_weights[]
 ) {
     // Splits the range given by start and end
     // over the available nodes. This implementation
     // assumes that node 0 handles the final part of the range
     // while node 1 handles the beginning, to form a ring pipeline
 
-    // Only node 0 deals with the device splits, other nodes
-    // get the splits from the scatter layers operation
-
-    if (ctx_mpi->rank != 0) {
-        return NULL;
-    }
-
     uint16_t range_length = end - start + 1;
     uint16_t ** ranges = (uint16_t**) malloc(sizeof(uint16_t*) * ctx_mpi->size);
     for (int i = 0; i < ctx_mpi->size; i++) {
         ranges[i] = (uint16_t*) malloc(sizeof(uint16_t) * 2);
     }
     uint16_t next_layer = 0;
-    for (int i=1; i < ctx_mpi->size; i++) {
+    for (int i=0; i < ctx_mpi->size; i++) {
         ranges[i][0] = next_layer;
         ranges[i][1] = MIN(end, ranges[i][0] + (node_weights[i] * range_length) + start);
         next_layer = ranges[i][1];
     }
 
-    ranges[0][0] = next_layer;
-    ranges[0][1] = MIN(end, next_layer + (node_weights[0] * range_length) + start);
+//    ranges[0][0] = next_layer;
+//    ranges[0][1] = MIN(end, next_layer + (node_weights[0] * range_length) + start);
 
     return ranges;
 }
@@ -775,8 +768,13 @@ GGML_CALL static void ggml_backend_mpi_buffer_free_buffer(ggml_backend_buffer_t
 
 GGML_CALL static void ggml_backend_mpi_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     auto * ctx = (ggml_backend_mpi_buffer_context *) buffer->context;
+
+    if (ggml_backend_mpi_buffer_rank(buffer) != ggml_backend_mpi_buffer_local_rank(buffer)) {
+        return;
+    }
+
 //    fprintf(stderr, "SETTING TENSOR WITHOUT MPI CALLS FOR %s (%s) AND TGT BUFFER %s\n", tensor->name, ggml_backend_buffer_name(tensor->buffer), ggml_backend_buffer_name(buffer));
-    return ctx->wrapped_buffer->iface.set_tensor(ctx->wrapped_buffer, tensor, data, offset, size);
+    ctx->wrapped_buffer->iface.set_tensor(ctx->wrapped_buffer, tensor, data, offset, size);
 }
 
 GGML_CALL static void ggml_backend_mpi_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -794,8 +792,12 @@ GGML_CALL static void ggml_backend_mpi_buffer_get_tensor(ggml_backend_buffer_t b
 }
 
 GGML_CALL static bool ggml_backend_mpi_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    if (ggml_backend_mpi_buffer_rank(src->buffer) == ggml_backend_mpi_buffer_rank(dst->buffer)) {
+        return ggml_backend_mpi_buffer_unwrap(buffer)->iface.cpy_tensor(ggml_backend_mpi_buffer_unwrap(buffer), src,
+                                                                        dst);
+    }
 
-    return ggml_backend_mpi_buffer_unwrap(buffer)->iface.cpy_tensor(ggml_backend_mpi_buffer_unwrap(buffer), src, dst);
+    return true;
 }
 
 GGML_CALL static void ggml_backend_mpi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -849,25 +851,25 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer
 }
 
 bool ggml_backend_mpi_cpy_tensor_async(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst) {
-    int src_rank = ggml_backend_mpi_buffer_rank(src->buffer);
-    int dst_rank = ggml_backend_mpi_buffer_rank(dst->buffer);
-
-    auto * ctx = static_cast<ggml_mpi_context *>(backend->context);
-
-    if (ctx->remote) {
-        return true;
-    }
-
-    if (src_rank == dst_rank) {
-//        src->buffer->iface.cpy_tensor(src->buffer, src, dst);
-        return true;
-    }
-
-    if (src_rank == ctx->rank) {
-        ggml_mpi_tensor_send(src, dst_rank, ctx->comm);
-    } else if (dst_rank == ctx->rank){
-        ggml_mpi_tensor_recv(dst, src_rank, ctx->comm);
-    }
+//    int src_rank = ggml_backend_mpi_buffer_rank(src->buffer);
+//    int dst_rank = ggml_backend_mpi_buffer_rank(dst->buffer);
+//
+//    auto * ctx = static_cast<ggml_mpi_context *>(backend->context);
+//
+//    if (ctx->remote) {
+//        return true;
+//    }
+//
+//    if (src_rank == dst_rank) {
+////        src->buffer->iface.cpy_tensor(src->buffer, src, dst);
+//        return true;
+//    }
+//
+//    if (src_rank == ggml_backend_mpi_local_rank(backend)) {
+//        ggml_mpi_tensor_send(src, dst_rank, ctx->comm);
+//    } else if (dst_rank == ggml_backend_mpi_local_rank(backend)){
+//        ggml_mpi_tensor_recv(dst, src_rank, ctx->comm);
+//    }
 
     return true;
 }
diff --git a/ggml-mpi.h b/ggml-mpi.h
index b4f616d60..fe8358f2d 100644
--- a/ggml-mpi.h
+++ b/ggml-mpi.h
@@ -202,7 +202,7 @@ uint16_t** ggml_mpi_split_range(
     struct ggml_mpi_context * ctx_mpi,
     uint16_t start,
     uint16_t end,
-    float node_weights[]
+    const float node_weights[]
 );
 
 // BACKEND V2
diff --git a/ggml.h b/ggml.h
index 3544e9d6a..e4aabab05 100644
--- a/ggml.h
+++ b/ggml.h
@@ -226,7 +226,7 @@
 
 #define GGML_MAX_DIMS           4
 #define GGML_MAX_PARAMS         2048
-#define GGML_MAX_CONTEXTS       128
+#define GGML_MAX_CONTEXTS       256
 #define GGML_MAX_SRC            10
 #ifndef GGML_MAX_NAME
 #define GGML_MAX_NAME           64
diff --git a/llama.cpp b/llama.cpp
index 9e0343cad..96b2adbbe 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2001,6 +2001,10 @@ struct llama_model {
     int64_t t_load_us = 0;
     int64_t t_start_us = 0;
 
+#ifdef GGML_USE_MPI
+    ggml_mpi_context * ctx_mpi = nullptr;
+#endif
+
     ~llama_model() {
         for (struct ggml_context * ctx : ctxs) {
             ggml_free(ctx);
         }
@@ -2099,9 +2103,7 @@ struct llama_context {
     struct ggml_tensor * inp_s_mask;  // F32 [1, kv_size]
     struct ggml_tensor * inp_s_seq;   // I32 [kv_size, n_batch]
 
-#ifdef GGML_USE_MPI
-    ggml_mpi_context * ctx_mpi = NULL;
-#endif
+
 };
 
 //
@@ -3277,6 +3279,11 @@ static void llm_load_hparams(
     auto & hparams = model.hparams;
     const gguf_context * ctx = ml.ctx_gguf;
 
+#ifdef GGML_USE_MPI
+    model.ctx_mpi = ggml_mpi_init();
+
+#endif
+
     // get metadata as string
     for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
         enum gguf_type type = gguf_get_kv_type(ctx, i);
@@ -4008,6 +4015,7 @@ static bool llm_load_tensors(
     enum llama_split_mode split_mode,
     int main_gpu,
     const float * tensor_split,
+    const float * node_split,
     bool use_mlock,
     llama_progress_callback progress_callback,
     void * progress_callback_user_data) {
@@ -4097,11 +4105,17 @@ static bool llm_load_tensors(
     }
 
 #ifdef GGML_USE_MPI
-    // TESTING: Setting all non-input/output layers to node 1
-    for (int64_t i = 0; i < n_layer; i++) {
-        ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[i].buft, 1);
-        ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[i].buft_matrix, 1);
+    uint16_t** ranges = ggml_mpi_split_range(model.ctx_mpi, 0, n_layer - 1, node_split);
+
+    size_t size = ggml_mpi_size(model.ctx_mpi);
+
+    for (size_t i = 0; i < size; i++) {
+        for (uint16_t j = ranges[i][0]; j < ranges[i][1]; j++) {
+            printf("Setting buffer rank for i %zu and j %d\n", i, j);
+            ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[j].buft, (int)i);
+            ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[j].buft_matrix, (int)i);
+        }
     }
 
 
@@ -5101,7 +5115,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
 #endif
 
     if (!llm_load_tensors(
-        ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
+        ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.node_layer_weights, params.use_mlock,
         params.progress_callback, params.progress_callback_user_data
     )) {
         return -2;
     }
@@ -8813,7 +8827,7 @@ static int llama_decode_internal(
     uint32_t n_tokens_all = batch_all.n_tokens;
 
 #ifdef GGML_USE_MPI
-    ggml_mpi_eval_init(lctx.ctx_mpi, &(batch_all.n_tokens), &(batch_all.pos), &(batch_all.n_seq_id), &(batch_all.seq_id), &(batch_all.logits), lctx.cparams.n_seq_max);
+    ggml_mpi_eval_init(lctx.model.ctx_mpi, &(batch_all.n_tokens), &(batch_all.pos), &(batch_all.n_seq_id), &(batch_all.seq_id), &(batch_all.logits), lctx.cparams.n_seq_max);
     n_tokens_all = batch_all.n_tokens;
 #endif
 
@@ -9003,7 +9017,7 @@ static int llama_decode_internal(
         // update the graphs to skip "result_output" if logits are not needed
         if (res) {
 #ifdef GGML_USE_MPI
-            if (ggml_mpi_rank(lctx.ctx_mpi) == 0) {
+            if (ggml_mpi_rank(lctx.model.ctx_mpi) == 0) {
 #endif
 
             ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
@@ -12636,7 +12650,7 @@ static int llama_apply_lora_from_file_internal(
 //
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
-        static_cast<int32_t *>(calloc(1, sizeof(int32_t))),
+        static_cast<const float *>(calloc(1, sizeof(float))),
         /*.n_gpu_layers =*/ 0,
         /*.split_mode   =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu     =*/ 0,
@@ -12706,7 +12720,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 
 int llama_node_id(struct llama_context * ctx) {
 #ifdef GGML_USE_MPI
-    return ggml_mpi_rank(ctx->ctx_mpi);
+    return ggml_mpi_rank(ctx->model.ctx_mpi);
 #endif
 
     return 0;
 }
@@ -13026,8 +13040,13 @@ struct llama_context * llama_new_context_with_model(
 
 #ifdef GGML_USE_MPI
+    std::vector<ggml_backend_t> new_backends;
 
-    ctx->backends = {ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size(), 1), ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size(), 0)};
+    for (size_t i = 0; i < ggml_mpi_size(model->ctx_mpi); i++) {
+        new_backends.push_back(ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size(), (int) i));
+    }
+
+    ctx->backends = new_backends;
 
 
@@ -13144,23 +13163,13 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
-#ifdef GGML_USE_MPI
-    ctx->ctx_mpi = ggml_mpi_init();
-#endif
 
     return ctx;
 }
 
 void llama_split_layers_weighted(struct llama_context * ctx, float device_weights[], size_t num_weights) {
-//#ifdef GGML_USE_MPI
-//    if (ggml_mpi_rank(ctx->ctx_mpi) == 0 && ggml_mpi_size(ctx->ctx_mpi) != num_weights) {
-//        GGML_ASSERT(false && "Must have same number of split percentages as devices");
-//    }
-//    uint16_t** ranges = ggml_mpi_split_range(ctx->ctx_mpi, 0, ctx->model.hparams.n_layer - 1, device_weights);
-//    ggml_mpi_scatter_layers(ctx->ctx_mpi, ranges);
-//    free(ranges);
-//#endif
+
 }
 
 void llama_free(struct llama_context * ctx) {
@@ -13998,7 +14007,7 @@ int32_t llama_decode(
         struct llama_batch batch) {
 #ifdef GGML_USE_MPI
-    if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
+    if (ggml_mpi_rank(ctx->model.ctx_mpi) > 0) {
         // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
         const int n_ctx = llama_n_ctx(ctx);
         std::vector<llama_token> tmp(n_ctx, llama_token_bos(&ctx->model));
diff --git a/llama.h b/llama.h
index 2f2e775ca..48ac9a324 100644
--- a/llama.h
+++ b/llama.h
@@ -203,7 +203,7 @@ extern "C" {
     struct llama_model_params {
         // Array of layers to allocate to each node
-        int32_t* n_node_layers;
+        const float * node_layer_weights;
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
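
Note on the weighted split (illustration only, not part of the patch): the llm_load_tensors hunk above assigns each transformer layer to an MPI rank by walking the ranges returned by ggml_mpi_split_range. The standalone sketch below mirrors that arithmetic so the mapping can be checked outside the build; the rank count, weights, and layer count are made-up example values, and the ggml/MPI APIs are not used here.

// weighted_split_sketch.cpp -- standalone illustration of the layer split math
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const uint16_t n_layer = 32;                                   // example model depth
    const std::vector<float> node_weights = {0.5f, 0.25f, 0.25f};  // one weight per MPI rank (example)

    const uint16_t start        = 0;
    const uint16_t end          = n_layer - 1;
    const uint16_t range_length = end - start + 1;

    uint16_t next_layer = 0;
    for (size_t i = 0; i < node_weights.size(); i++) {
        // Same truncating integer math as the patched loop:
        //   ranges[i][1] = MIN(end, ranges[i][0] + node_weights[i] * range_length + start)
        uint16_t lo = next_layer;
        uint16_t hi = std::min<uint16_t>(end, (uint16_t)(lo + node_weights[i] * range_length + start));
        next_layer = hi;

        // llm_load_tensors() then sets the buffer-type rank for layers j in [lo, hi) to rank i
        printf("rank %zu gets layers [%u, %u)\n", i, (unsigned) lo, (unsigned) hi);
    }
    return 0;
}

With these example inputs the sketch prints ranges [0, 16), [16, 24), and [24, 31); the last upper bound is clamped to end = n_layer - 1 by the MIN(end, ...) term, matching the patched loop.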