From 619bf62acf4c79df4bc8b99a99f597f1092c009c Mon Sep 17 00:00:00 2001 From: Branden Butler Date: Tue, 12 Mar 2024 11:33:33 -0500 Subject: [PATCH] Support new MPI backend in llama.cpp and increase GGML max split inputs --- ggml.h | 2 +- llama.cpp | 81 ++++++++++++++++++++++++++++++++++--------------------- 2 files changed, 51 insertions(+), 32 deletions(-) diff --git a/ggml.h b/ggml.h index a4efe792d..3544e9d6a 100644 --- a/ggml.h +++ b/ggml.h @@ -226,7 +226,7 @@ #define GGML_MAX_DIMS 4 #define GGML_MAX_PARAMS 2048 -#define GGML_MAX_CONTEXTS 64 +#define GGML_MAX_CONTEXTS 128 #define GGML_MAX_SRC 10 #ifndef GGML_MAX_NAME #define GGML_MAX_NAME 64 diff --git a/llama.cpp b/llama.cpp index f2f052bbf..0c0c783b1 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1474,7 +1474,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer } #if defined(GGML_USE_MPI) - buft = ggml_backend_mpi_wrap_buffer(buft); + buft = ggml_backend_mpi_wrap_buffer_type(buft); #endif return buft; @@ -1528,9 +1528,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g buft = llama_default_buffer_type_offload(fallback_gpu); } -#if defined(GGML_USE_MPI) - buft = ggml_backend_mpi_wrap_buffer(buft); -#endif return buft; @@ -2177,7 +2174,7 @@ static bool llama_kv_cache_init( }; ggml_context * ctx = ggml_init(params); if (!ctx) { - LLAMA_LOG_ERROR("%s: failed to allocate context for kv cache\n", __func__); + LLAMA_LOG_ERROR("%s: failed to allocate context for kv cache, n_layers=%d\n", __func__, n_layers); return false; } ctx_map[it.first] = ctx; @@ -4099,15 +4096,23 @@ static bool llm_load_tensors( } #ifdef GGML_USE_MPI -// for (int64_t i = 0; i < n_layer; i++) { -// model.buft_layer[i] = {ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft_matrix), -// ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft)}; -// } -// -// model.buft_input = {ggml_backend_mpi_wrap_buffer(model.buft_input.buft_matrix), -// ggml_backend_mpi_wrap_buffer(model.buft_input.buft)}; -// model.buft_output = {ggml_backend_mpi_wrap_buffer(model.buft_output.buft_matrix), -// ggml_backend_mpi_wrap_buffer(model.buft_output.buft)}; + // TESTING: Setting all non-input/output layers to node 1 + for (int64_t i = 0; i < n_layer; i++) { + ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[i].buft, 1); + ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[i].buft_matrix, 1); + + } + + + // Will run with inputs on other nodes, but output may not be correct. 
+ // Default is node 0 anyway, but better to be explicit about it + ggml_backend_mpi_buffer_type_set_rank(model.buft_input.buft, 0); + ggml_backend_mpi_buffer_type_set_rank(model.buft_input.buft_matrix, 0); + + + // Outputs *must* be on node 0, otherwise a deadlock occurs + ggml_backend_mpi_buffer_type_set_rank(model.buft_output.buft, 0); + ggml_backend_mpi_buffer_type_set_rank(model.buft_output.buft_matrix, 0); #endif // count used buffer types @@ -4968,6 +4973,9 @@ static bool llm_load_tensors( size_t first, last; ml.get_mapping_range(&first, &last, ctx); buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first); +#ifdef GGML_USE_MPI + buf = ggml_backend_mpi_wrap_buffer(buf); +#endif } #ifdef GGML_USE_METAL else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) { @@ -8784,7 +8792,7 @@ static void llama_graph_compute( ggml_backend_sched_graph_compute_async(lctx.sched, gf); // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); - + } // decode a batch of tokens by evaluating the transformer @@ -8800,7 +8808,14 @@ static int llama_decode_internal( llama_context & lctx, llama_batch batch_all) { // TODO: rename back to batch + uint32_t n_tokens_all = batch_all.n_tokens; + +#ifdef GGML_USE_MPI + ggml_mpi_eval_init(lctx.ctx_mpi, &(batch_all.n_tokens), &(batch_all.pos), &(batch_all.n_seq_id), &(batch_all.seq_id), &(batch_all.logits)); + n_tokens_all = batch_all.n_tokens; +#endif + if (n_tokens_all == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__); return -1; @@ -8900,12 +8915,7 @@ static int llama_decode_internal( kv_self.head = 0; } - #ifdef GGML_USE_MPI - // TODO: needs fix after #3228 - ggml_mpi_eval_init(lctx.ctx_mpi, &(u_batch.n_tokens), &(u_batch.pos), &(u_batch.n_seq_id), &(u_batch.seq_id), &(u_batch.logits)); - n_tokens = u_batch.n_tokens; -#endif - if (!llama_kv_cache_find_slot(kv_self, u_batch)) { + if (!llama_kv_cache_find_slot(kv_self, u_batch)) { return 1; } @@ -8991,7 +9001,11 @@ static int llama_decode_internal( // TODO: do not compute and extract logits if only embeddings are needed // update the graphs to skip "result_output" if logits are not needed if (res) { - ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res); + #ifdef GGML_USE_MPI + if (ggml_mpi_rank(lctx.ctx_mpi) == 0) { +#endif + + ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res); GGML_ASSERT(backend_res != nullptr); if (u_batch.logits) { int32_t i_first = -1; @@ -9092,6 +9106,10 @@ static int llama_decode_internal( } } +#ifdef GGML_USE_MPI + } +#endif + return 0; } @@ -13008,7 +13026,8 @@ struct llama_context * llama_new_context_with_model( #ifdef GGML_USE_MPI - ctx->backends = {ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size())}; + + ctx->backends = {ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size(), 1), ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size(), 0)}; @@ -13134,14 +13153,14 @@ struct llama_context * llama_new_context_with_model( } void llama_split_layers_weighted(struct llama_context * ctx, float device_weights[], size_t num_weights) { -#ifdef GGML_USE_MPI - if (ggml_mpi_rank(ctx->ctx_mpi) == 0 && ggml_mpi_size(ctx->ctx_mpi) != num_weights) { - GGML_ASSERT(false && "Must have same number of split percentages as devices"); - } - uint16_t** ranges = ggml_mpi_split_range(ctx->ctx_mpi, 0, ctx->model.hparams.n_layer - 1, device_weights); - ggml_mpi_scatter_layers(ctx->ctx_mpi, ranges); - free(ranges); -#endif +//#ifdef GGML_USE_MPI +// 
+//    if (ggml_mpi_rank(ctx->ctx_mpi) == 0 && ggml_mpi_size(ctx->ctx_mpi) != num_weights) {
+//        GGML_ASSERT(false && "Must have same number of split percentages as devices");
+//    }
+//    uint16_t** ranges = ggml_mpi_split_range(ctx->ctx_mpi, 0, ctx->model.hparams.n_layer - 1, device_weights);
+//    ggml_mpi_scatter_layers(ctx->ctx_mpi, ranges);
+//    free(ranges);
+//#endif
 }
 
 void llama_free(struct llama_context * ctx) {
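
Note for reviewers, not part of the diff above: the hunks in llm_load_tensors hard-code a two-rank test layout (all repeating layers pinned to rank 1, inputs and outputs pinned to rank 0). The sketch below shows how that placement might be generalized using only the ggml_backend_mpi_buffer_type_set_rank() calls that appear in this patch; the helper name, its signature, and the contiguous block split policy are illustrative assumptions, not code in this change.

    // Illustrative sketch only: spread the n_layer repeating layers across
    // mpi_size ranks in contiguous blocks, while keeping input and output
    // buffer types on rank 0 (outputs must stay on rank 0, otherwise the
    // deadlock noted in the llm_load_tensors comment above occurs).
    static void assign_layer_ranks_evenly(llama_model & model, int64_t n_layer, int mpi_size) {
        for (int64_t i = 0; i < n_layer; i++) {
            // layer i falls into block floor(i * mpi_size / n_layer)
            const int rank = (int) ((i * mpi_size) / n_layer);
            ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[i].buft,        rank);
            ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[i].buft_matrix, rank);
        }

        // inputs default to rank 0, but keep the assignment explicit
        ggml_backend_mpi_buffer_type_set_rank(model.buft_input.buft,        0);
        ggml_backend_mpi_buffer_type_set_rank(model.buft_input.buft_matrix, 0);

        // outputs *must* be on rank 0, otherwise a deadlock occurs
        ggml_backend_mpi_buffer_type_set_rank(model.buft_output.buft,        0);
        ggml_backend_mpi_buffer_type_set_rank(model.buft_output.buft_matrix, 0);
    }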