Support new MPI backend in llama.cpp and increase GGML max split inputs

Branden Butler 2024-03-12 11:33:33 -05:00
parent 942ce843f8
commit 619bf62acf
2 changed files with 51 additions and 32 deletions

ggml.h

@@ -226,7 +226,7 @@
 #define GGML_MAX_DIMS 4
 #define GGML_MAX_PARAMS 2048
-#define GGML_MAX_CONTEXTS 64
+#define GGML_MAX_CONTEXTS 128
 #define GGML_MAX_SRC 10
 #ifndef GGML_MAX_NAME
 #define GGML_MAX_NAME 64

llama.cpp

@@ -1474,7 +1474,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer)
     }
 #if defined(GGML_USE_MPI)
-    buft = ggml_backend_mpi_wrap_buffer(buft);
+    buft = ggml_backend_mpi_wrap_buffer_type(buft);
 #endif
     return buft;
@@ -1528,9 +1528,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu)
         buft = llama_default_buffer_type_offload(fallback_gpu);
     }
-#if defined(GGML_USE_MPI)
-    buft = ggml_backend_mpi_wrap_buffer(buft);
-#endif
     return buft;
@@ -2177,7 +2174,7 @@ static bool llama_kv_cache_init(
         };
         ggml_context * ctx = ggml_init(params);
         if (!ctx) {
-            LLAMA_LOG_ERROR("%s: failed to allocate context for kv cache\n", __func__);
+            LLAMA_LOG_ERROR("%s: failed to allocate context for kv cache, n_layers=%d\n", __func__, n_layers);
             return false;
         }
         ctx_map[it.first] = ctx;
@@ -4099,15 +4096,23 @@ static bool llm_load_tensors(
     }
 #ifdef GGML_USE_MPI
-    // for (int64_t i = 0; i < n_layer; i++) {
-    //     model.buft_layer[i] = {ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft_matrix),
-    //                            ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft)};
-    // }
-    //
-    // model.buft_input = {ggml_backend_mpi_wrap_buffer(model.buft_input.buft_matrix),
-    //                     ggml_backend_mpi_wrap_buffer(model.buft_input.buft)};
-    // model.buft_output = {ggml_backend_mpi_wrap_buffer(model.buft_output.buft_matrix),
-    //                      ggml_backend_mpi_wrap_buffer(model.buft_output.buft)};
+    // TESTING: Setting all non-input/output layers to node 1
+    for (int64_t i = 0; i < n_layer; i++) {
+        ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[i].buft, 1);
+        ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[i].buft_matrix, 1);
+    }
+
+    // Will run with inputs on other nodes, but output may not be correct.
+    // Default is node 0 anyway, but better to be explicit about it
+    ggml_backend_mpi_buffer_type_set_rank(model.buft_input.buft, 0);
+    ggml_backend_mpi_buffer_type_set_rank(model.buft_input.buft_matrix, 0);
+    // Outputs *must* be on node 0, otherwise a deadlock occurs
+    ggml_backend_mpi_buffer_type_set_rank(model.buft_output.buft, 0);
+    ggml_backend_mpi_buffer_type_set_rank(model.buft_output.buft_matrix, 0);
 #endif
     // count used buffer types
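Note on the hunk above: it hard-codes every repeating layer to rank 1 for testing. A minimal sketch of a more general assignment, assuming only the ggml_backend_mpi_buffer_type_set_rank() call as it is used above; the helper name and the even-split policy are illustrative, not part of this commit.

// Illustrative sketch: split the repeating layers into contiguous blocks, one per
// MPI rank, instead of pinning them all to rank 1. Inputs and outputs stay on
// rank 0, matching the constraints noted in the comments above.
static void mpi_assign_layers_evenly(llama_model & model, int64_t n_layer, int world_size) {
    for (int64_t i = 0; i < n_layer; i++) {
        const int rank = (int) ((i * world_size) / n_layer); // contiguous block per rank
        ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[i].buft,        rank);
        ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[i].buft_matrix, rank);
    }
    ggml_backend_mpi_buffer_type_set_rank(model.buft_input.buft,         0);
    ggml_backend_mpi_buffer_type_set_rank(model.buft_input.buft_matrix,  0);
    ggml_backend_mpi_buffer_type_set_rank(model.buft_output.buft,        0);
    ggml_backend_mpi_buffer_type_set_rank(model.buft_output.buft_matrix, 0);
}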
@@ -4968,6 +4973,9 @@ static bool llm_load_tensors(
                 size_t first, last;
                 ml.get_mapping_range(&first, &last, ctx);
                 buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
+#ifdef GGML_USE_MPI
+                buf = ggml_backend_mpi_wrap_buffer(buf);
+#endif
             }
 #ifdef GGML_USE_METAL
             else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
@@ -8800,7 +8808,14 @@ static int llama_decode_internal(
           llama_context & lctx,
             llama_batch   batch_all) { // TODO: rename back to batch
     uint32_t n_tokens_all = batch_all.n_tokens;
+
+#ifdef GGML_USE_MPI
+    ggml_mpi_eval_init(lctx.ctx_mpi, &(batch_all.n_tokens), &(batch_all.pos), &(batch_all.n_seq_id), &(batch_all.seq_id), &(batch_all.logits));
+    n_tokens_all = batch_all.n_tokens;
+#endif
+
     if (n_tokens_all == 0) {
         LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
         return -1;
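Note: ggml_mpi_eval_init() now runs once at the top of llama_decode_internal(), on the whole batch, before it is split into u_batches (the old per-u_batch call is removed in a later hunk). A small wrapper mirroring the call site; the parameter expressions are taken from the hunk above, while the synchronization semantics described in the comment are an assumption, not something this diff defines.

// Sketch only: after this call, every rank is assumed to see the same batch
// metadata (token count, positions, sequence ids, logits flags) before any
// per-token work begins.
static void mpi_sync_batch(ggml_mpi_context * ctx_mpi, llama_batch & batch) {
    ggml_mpi_eval_init(ctx_mpi,
                       &batch.n_tokens,
                       &batch.pos,
                       &batch.n_seq_id,
                       &batch.seq_id,
                       &batch.logits);
}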
@@ -8900,12 +8915,7 @@ static int llama_decode_internal(
             kv_self.head = 0;
         }
-#ifdef GGML_USE_MPI
-        // TODO: needs fix after #3228
-        ggml_mpi_eval_init(lctx.ctx_mpi, &(u_batch.n_tokens), &(u_batch.pos), &(u_batch.n_seq_id), &(u_batch.seq_id), &(u_batch.logits));
-        n_tokens = u_batch.n_tokens;
-#endif
         if (!llama_kv_cache_find_slot(kv_self, u_batch)) {
             return 1;
         }
@@ -8991,7 +9001,11 @@ static int llama_decode_internal(
         // TODO: do not compute and extract logits if only embeddings are needed
         // update the graphs to skip "result_output" if logits are not needed
         if (res) {
-            ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
+#ifdef GGML_USE_MPI
+            if (ggml_mpi_rank(lctx.ctx_mpi) == 0) {
+#endif
+            ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
             GGML_ASSERT(backend_res != nullptr);
             if (u_batch.logits) {
                 int32_t i_first = -1;
@@ -9092,6 +9106,10 @@ static int llama_decode_internal(
         }
     }
+#ifdef GGML_USE_MPI
+    }
+#endif
+
     return 0;
 }
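Note: the two hunks above gate result extraction behind ggml_mpi_rank(lctx.ctx_mpi) == 0, so only the head node copies logits/embeddings back to host memory. The same gating pattern in isolation, as a standalone program using plain MPI calls (not the llama.cpp API):

// Minimal standalone illustration of rank gating: only rank 0 performs the
// host-side "result extraction" step; all ranks still reach MPI_Finalize.
#include <mpi.h>
#include <cstdio>

int main(int argc, char ** argv) {
    MPI_Init(&argc, &argv);
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (rank == 0) {
        // In llama_decode_internal this is where logits/embeddings are copied out.
        std::printf("rank 0: extracting results\n");
    }
    MPI_Finalize();
    return 0;
}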
@@ -13008,7 +13026,8 @@ struct llama_context * llama_new_context_with_model(
 #ifdef GGML_USE_MPI
-        ctx->backends = {ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size())};
+        ctx->backends = {ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size(), 1),
+                         ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size(), 0)};
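Note: ggml_backend_mpi_init() now takes a trailing rank argument, and the context ends up with one wrapped backend per rank, highest rank first. A sketch generalizing the same construction to any world size; only the ggml_backend_mpi_init(backends, n_backends, rank) call is taken from the hunk above, the loop and the world_size variable are illustrative.

// Sketch only: build one MPI-wrapped backend per rank instead of hard-coding
// ranks {1, 0}. The original ctx->backends vector is read before it is replaced.
std::vector<ggml_backend_t> wrapped;
for (int rank = world_size - 1; rank >= 0; --rank) {
    wrapped.push_back(ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size(), rank));
}
ctx->backends = std::move(wrapped);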
@@ -13134,14 +13153,14 @@ struct llama_context * llama_new_context_with_model(
 }
 
 void llama_split_layers_weighted(struct llama_context * ctx, float device_weights[], size_t num_weights) {
-#ifdef GGML_USE_MPI
-    if (ggml_mpi_rank(ctx->ctx_mpi) == 0 && ggml_mpi_size(ctx->ctx_mpi) != num_weights) {
-        GGML_ASSERT(false && "Must have same number of split percentages as devices");
-    }
-    uint16_t** ranges = ggml_mpi_split_range(ctx->ctx_mpi, 0, ctx->model.hparams.n_layer - 1, device_weights);
-    ggml_mpi_scatter_layers(ctx->ctx_mpi, ranges);
-    free(ranges);
-#endif
+//#ifdef GGML_USE_MPI
+//    if (ggml_mpi_rank(ctx->ctx_mpi) == 0 && ggml_mpi_size(ctx->ctx_mpi) != num_weights) {
+//        GGML_ASSERT(false && "Must have same number of split percentages as devices");
+//    }
+//    uint16_t** ranges = ggml_mpi_split_range(ctx->ctx_mpi, 0, ctx->model.hparams.n_layer - 1, device_weights);
+//    ggml_mpi_scatter_layers(ctx->ctx_mpi, ranges);
+//    free(ranges);
+//#endif
 }
 
 void llama_free(struct llama_context * ctx) {
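Note: the old weighted split path (ggml_mpi_split_range() / ggml_mpi_scatter_layers()) is commented out above rather than ported to the new backend. A sketch of how the same weighting could be expressed with the new per-buffer-type rank API; everything except ggml_backend_mpi_buffer_type_set_rank() and the fields visible elsewhere in this diff is illustrative, not part of this commit.

// Sketch only: assign contiguous blocks of layers to ranks in proportion to
// device_weights[r]; the last rank absorbs any rounding remainder.
static void llama_split_layers_weighted_sketch(struct llama_context * ctx,
                                               const float * device_weights, size_t num_weights) {
    const int64_t n_layer = ctx->model.hparams.n_layer;
    int64_t next = 0;
    for (size_t r = 0; r < num_weights; ++r) {
        const int64_t share = (r + 1 == num_weights)
            ? n_layer - next
            : (int64_t) (device_weights[r] * (float) n_layer + 0.5f);
        for (int64_t k = 0; k < share && next < n_layer; ++k, ++next) {
            ggml_backend_mpi_buffer_type_set_rank(ctx->model.buft_layer[next].buft,        (int) r);
            ggml_backend_mpi_buffer_type_set_rank(ctx->model.buft_layer[next].buft_matrix, (int) r);
        }
    }
}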