Remove hard-coded layer splits and support more than 2 nodes
parent 5f156f3a0c
commit 4692644ff9
6 changed files with 75 additions and 60 deletions
@@ -1319,6 +1319,10 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
         mparams.kv_overrides = params.kv_overrides.data();
     }
 
+    free((void *) mparams.node_layer_weights);
+
+    mparams.node_layer_weights = params.mpi_layer_split.data();
+
     return mparams;
 }
ggml-mpi.cpp (66 changed lines)
@@ -285,34 +285,27 @@ uint16_t** ggml_mpi_split_range(
     struct ggml_mpi_context * ctx_mpi,
     uint16_t start,
     uint16_t end,
-    float node_weights[]
+    const float node_weights[]
 ) {
     // Splits the range given by start and end
     // over the available nodes. This implementation
-    // assumes that node 0 handles the final part of the range
-    // while node 1 handles the beginning, to form a ring pipeline
 
-    // Only node 0 deals with the device splits, other nodes
-    // get the splits from the scatter layers operation
 
-    if (ctx_mpi->rank != 0) {
-        return NULL;
-    }
 
     uint16_t range_length = end - start + 1;
     uint16_t ** ranges = (uint16_t**) malloc(sizeof(uint16_t*) * ctx_mpi->size);
     for (int i = 0; i < ctx_mpi->size; i++) {
         ranges[i] = (uint16_t*) malloc(sizeof(uint16_t) * 2);
     }
     uint16_t next_layer = 0;
-    for (int i=1; i < ctx_mpi->size; i++) {
+    for (int i=0; i < ctx_mpi->size; i++) {
         ranges[i][0] = next_layer;
         ranges[i][1] = MIN(end, ranges[i][0] + (node_weights[i] * range_length) + start);
         next_layer = ranges[i][1];
     }
 
-    ranges[0][0] = next_layer;
-    ranges[0][1] = MIN(end, next_layer + (node_weights[0] * range_length) + start);
+    // ranges[0][0] = next_layer;
+    // ranges[0][1] = MIN(end, next_layer + (node_weights[0] * range_length) + start);
     return ranges;
 
 }
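For context, the following standalone sketch (not part of this commit) mirrors the reworked loop in ggml_mpi_split_range to show how per-node weights become layer ranges once the rank-0 guard and the special-cased node 0 are gone. The world size of 3, the example weights, and the MIN definition are assumptions made only for illustration.

    // split_sketch.cpp: illustrative only; mirrors the loop shown in the hunk above
    #include <cstdio>
    #include <cstdint>

    #define MIN(a, b) ((a) < (b) ? (a) : (b)) // assumed to match the macro used by ggml-mpi.cpp

    int main() {
        const int      size            = 3;                     // hypothetical MPI world size
        const float    node_weights[3] = {0.50f, 0.25f, 0.25f}; // hypothetical per-node weights
        const uint16_t start = 0, end = 31;                     // e.g. a 32-layer model

        uint16_t ranges[3][2];
        const uint16_t range_length = end - start + 1;
        uint16_t next_layer = 0;
        for (int i = 0; i < size; i++) {
            ranges[i][0] = next_layer;
            // float product is truncated on assignment, exactly as in the diff
            ranges[i][1] = MIN(end, ranges[i][0] + (node_weights[i] * range_length) + start);
            next_layer   = ranges[i][1];
        }
        for (int i = 0; i < size; i++) {
            printf("rank %d: layers [%d, %d)\n", i, ranges[i][0], ranges[i][1]);
        }
        return 0;
    }

With these inputs the sketch prints [0, 16), [16, 24) and [24, 31). Because the last range is clamped to end and the consumer loop in llm_load_tensors (further down) iterates j < ranges[i][1], the final layer is not reassigned for this example input; whether that is intended is not clear from the diff alone.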
@@ -775,8 +768,13 @@ GGML_CALL static void ggml_backend_mpi_buffer_free_buffer(ggml_backend_buffer_t
 
 GGML_CALL static void ggml_backend_mpi_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     auto * ctx = (ggml_backend_mpi_buffer_context *) buffer->context;
+
+    if (ggml_backend_mpi_buffer_rank(buffer) != ggml_backend_mpi_buffer_local_rank(buffer)) {
+        return;
+    }
+
     // fprintf(stderr, "SETTING TENSOR WITHOUT MPI CALLS FOR %s (%s) AND TGT BUFFER %s\n", tensor->name, ggml_backend_buffer_name(tensor->buffer), ggml_backend_buffer_name(buffer));
-    return ctx->wrapped_buffer->iface.set_tensor(ctx->wrapped_buffer, tensor, data, offset, size);
+    ctx->wrapped_buffer->iface.set_tensor(ctx->wrapped_buffer, tensor, data, offset, size);
 }
 
 GGML_CALL static void ggml_backend_mpi_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
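The guard added above is what lets every rank run the same weight-loading code: writes into buffers owned by another rank simply become no-ops. A minimal sketch of that ownership check, using stand-in types rather than the real ggml structs:

    // guard_sketch.cpp: illustrative only; fake_buffer stands in for the MPI-wrapped ggml buffer
    #include <cstdio>
    #include <cstring>
    #include <vector>

    struct fake_buffer {
        int owner_rank;              // what ggml_backend_mpi_buffer_rank() would report
        int local_rank;              // what ggml_backend_mpi_buffer_local_rank() would report
        std::vector<char> storage;   // stands in for the wrapped (real) buffer
    };

    // mirrors the early return added to ggml_backend_mpi_buffer_set_tensor
    static void set_tensor(fake_buffer & buf, const void * data, size_t offset, size_t size) {
        if (buf.owner_rank != buf.local_rank) {
            return; // tensor lives on another rank; nothing to store locally
        }
        std::memcpy(buf.storage.data() + offset, data, size);
    }

    int main() {
        const float weights[4] = {1.0f, 2.0f, 3.0f, 4.0f};

        fake_buffer owned  = {0, 0, std::vector<char>(sizeof(weights))};
        fake_buffer remote = {2, 0, std::vector<char>(sizeof(weights))};

        set_tensor(owned,  weights, 0, sizeof(weights)); // forwarded to the wrapped buffer
        set_tensor(remote, weights, 0, sizeof(weights)); // skipped: rank 0 does not own it

        float a = 0.0f, b = 0.0f;
        std::memcpy(&a, owned.storage.data(),  sizeof(float));
        std::memcpy(&b, remote.storage.data(), sizeof(float));
        printf("owned[0]=%.1f remote[0]=%.1f\n", a, b);
        return 0;
    }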
@@ -794,8 +792,12 @@ GGML_CALL static void ggml_backend_mpi_buffer_get_tensor(ggml_backend_buffer_t b
 }
 
 GGML_CALL static bool ggml_backend_mpi_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
-    return ggml_backend_mpi_buffer_unwrap(buffer)->iface.cpy_tensor(ggml_backend_mpi_buffer_unwrap(buffer), src, dst);
+    if (ggml_backend_mpi_buffer_rank(src->buffer) == ggml_backend_mpi_buffer_rank(dst->buffer)) {
+        return ggml_backend_mpi_buffer_unwrap(buffer)->iface.cpy_tensor(ggml_backend_mpi_buffer_unwrap(buffer), src, dst);
+    }
+
+    return true;
 }
 
 GGML_CALL static void ggml_backend_mpi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -849,25 +851,25 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer
 }
 
 bool ggml_backend_mpi_cpy_tensor_async(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    int src_rank = ggml_backend_mpi_buffer_rank(src->buffer);
+    int dst_rank = ggml_backend_mpi_buffer_rank(dst->buffer);
+
+    auto * ctx = static_cast<ggml_mpi_context *>(backend->context);
+
+    if (ctx->remote) {
+        return true;
+    }
+
+    if (src_rank == dst_rank) {
+        // src->buffer->iface.cpy_tensor(src->buffer, src, dst);
+        return true;
+    }
+
+    if (src_rank == ctx->rank) {
+        ggml_mpi_tensor_send(src, dst_rank, ctx->comm);
+    } else if (dst_rank == ctx->rank){
+        ggml_mpi_tensor_recv(dst, src_rank, ctx->comm);
+    }
 
-//    int src_rank = ggml_backend_mpi_buffer_rank(src->buffer);
-//    int dst_rank = ggml_backend_mpi_buffer_rank(dst->buffer);
-//
-//    auto * ctx = static_cast<ggml_mpi_context *>(backend->context);
-//
-//    if (ctx->remote) {
-//        return true;
-//    }
-//
-//    if (src_rank == dst_rank) {
-////        src->buffer->iface.cpy_tensor(src->buffer, src, dst);
-//        return true;
-//    }
-//
-//    if (src_rank == ggml_backend_mpi_local_rank(backend)) {
-//        ggml_mpi_tensor_send(src, dst_rank, ctx->comm);
-//    } else if (dst_rank == ggml_backend_mpi_local_rank(backend)){
-//        ggml_mpi_tensor_recv(dst, src_rank, ctx->comm);
-//    }
     return true;
 
 }
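Once layers live on different ranks, the communication pattern in ggml_backend_mpi_cpy_tensor_async is: the rank that owns the source tensor sends, the rank that owns the destination receives, and every other rank does nothing. A plain-MPI sketch of that pattern (an assumed analogue of ggml_mpi_tensor_send / ggml_mpi_tensor_recv, not code from this repository):

    // cpy_sketch.cpp: build with mpic++, run with e.g. mpirun -n 2 ./a.out
    #include <mpi.h>
    #include <cstdio>
    #include <vector>

    int main(int argc, char ** argv) {
        MPI_Init(&argc, &argv);
        int rank = 0;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        // pretend this is the data of a tensor owned by rank 0 and needed on rank 1
        std::vector<float> data(8, rank == 0 ? 42.0f : 0.0f);
        const int src_rank = 0, dst_rank = 1;

        if (rank == src_rank) {
            MPI_Send(data.data(), (int) data.size(), MPI_FLOAT, dst_rank, 0, MPI_COMM_WORLD);
        } else if (rank == dst_rank) {
            MPI_Recv(data.data(), (int) data.size(), MPI_FLOAT, src_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            printf("rank %d received %.1f\n", rank, data[0]);
        } // all other ranks fall through, like the same-rank/remote cases above

        MPI_Finalize();
        return 0;
    }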
ggml-mpi.h
@@ -202,7 +202,7 @@ uint16_t** ggml_mpi_split_range(
     struct ggml_mpi_context * ctx_mpi,
     uint16_t start,
     uint16_t end,
-    float node_weights[]
+    const float node_weights[]
 );
 
 // BACKEND V2
ggml.h (2 changed lines)
@@ -226,7 +226,7 @@
 
 #define GGML_MAX_DIMS 4
 #define GGML_MAX_PARAMS 2048
-#define GGML_MAX_CONTEXTS 128
+#define GGML_MAX_CONTEXTS 256
 #define GGML_MAX_SRC 10
 #ifndef GGML_MAX_NAME
 #define GGML_MAX_NAME 64
llama.cpp (59 changed lines)
@@ -2001,6 +2001,10 @@ struct llama_model {
     int64_t t_load_us = 0;
     int64_t t_start_us = 0;
 
+#ifdef GGML_USE_MPI
+    ggml_mpi_context * ctx_mpi = nullptr;
+#endif
+
     ~llama_model() {
         for (struct ggml_context * ctx : ctxs) {
             ggml_free(ctx);
@@ -2099,9 +2103,7 @@ struct llama_context {
     struct ggml_tensor * inp_s_mask; // F32 [1, kv_size]
     struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch]
 
-#ifdef GGML_USE_MPI
-    ggml_mpi_context * ctx_mpi = NULL;
-#endif
-
 };
 
 //
@@ -3277,6 +3279,11 @@ static void llm_load_hparams(
     auto & hparams = model.hparams;
     const gguf_context * ctx = ml.ctx_gguf;
 
+#ifdef GGML_USE_MPI
+    model.ctx_mpi = ggml_mpi_init();
+
+#endif
+
     // get metadata as string
     for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
         enum gguf_type type = gguf_get_kv_type(ctx, i);
@@ -4008,6 +4015,7 @@ static bool llm_load_tensors(
     enum llama_split_mode split_mode,
     int main_gpu,
     const float * tensor_split,
+    const float * node_split,
     bool use_mlock,
     llama_progress_callback progress_callback,
     void * progress_callback_user_data) {
@@ -4097,11 +4105,17 @@ static bool llm_load_tensors(
     }
 
 #ifdef GGML_USE_MPI
-    // TESTING: Setting all non-input/output layers to node 1
-    for (int64_t i = 0; i < n_layer; i++) {
-        ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[i].buft, 1);
-        ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[i].buft_matrix, 1);
+    uint16_t** ranges = ggml_mpi_split_range(model.ctx_mpi, 0, n_layer - 1, node_split);
+
+    size_t size = ggml_mpi_size(model.ctx_mpi);
+
+    for (size_t i = 0; i < size; i++) {
+        for (uint16_t j = ranges[i][0]; j < ranges[i][1]; j++) {
+            printf("Setting buffer rank for i %zu and j %d\n", i, j);
+            ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[j].buft, (int)i);
+            ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[j].buft_matrix, (int)i);
+        }
     }
 
 
@@ -5101,7 +5115,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
 #endif
 
         if (!llm_load_tensors(
-            ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
+            ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.node_layer_weights, params.use_mlock,
             params.progress_callback, params.progress_callback_user_data
         )) {
             return -2;
@@ -8813,7 +8827,7 @@ static int llama_decode_internal(
     uint32_t n_tokens_all = batch_all.n_tokens;
 
 #ifdef GGML_USE_MPI
-    ggml_mpi_eval_init(lctx.ctx_mpi, &(batch_all.n_tokens), &(batch_all.pos), &(batch_all.n_seq_id), &(batch_all.seq_id), &(batch_all.logits), lctx.cparams.n_seq_max);
+    ggml_mpi_eval_init(lctx.model.ctx_mpi, &(batch_all.n_tokens), &(batch_all.pos), &(batch_all.n_seq_id), &(batch_all.seq_id), &(batch_all.logits), lctx.cparams.n_seq_max);
     n_tokens_all = batch_all.n_tokens;
 #endif
 
@@ -9003,7 +9017,7 @@ static int llama_decode_internal(
     // update the graphs to skip "result_output" if logits are not needed
     if (res) {
 #ifdef GGML_USE_MPI
-        if (ggml_mpi_rank(lctx.ctx_mpi) == 0) {
+        if (ggml_mpi_rank(lctx.model.ctx_mpi) == 0) {
 #endif
 
         ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
@@ -12636,7 +12650,7 @@ static int llama_apply_lora_from_file_internal(
 //
 
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
-        static_cast<int32_t *>(calloc(1, sizeof(int32_t))),
+        static_cast<float *>(calloc(1, sizeof(float))),
         /*.n_gpu_layers =*/ 0,
         /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu =*/ 0,
@@ -12706,7 +12720,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 
 int llama_node_id(struct llama_context * ctx) {
 #ifdef GGML_USE_MPI
-    return ggml_mpi_rank(ctx->ctx_mpi);
+    return ggml_mpi_rank(ctx->model.ctx_mpi);
 
 #endif
     return 0;
@@ -13026,8 +13040,13 @@ struct llama_context * llama_new_context_with_model(
 
 #ifdef GGML_USE_MPI
 
+    std::vector<ggml_backend_t> new_backends;
 
-    ctx->backends = {ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size(), 1), ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size(), 0)};
+    for (size_t i = 0; i < ggml_mpi_size(model->ctx_mpi); i++) {
+        new_backends.push_back(ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size(), (int) i));
+    }
+
+    ctx->backends = new_backends;
 
 
@@ -13144,23 +13163,13 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
-#ifdef GGML_USE_MPI
-    ctx->ctx_mpi = ggml_mpi_init();
-
-#endif
-
     return ctx;
 }
 
 void llama_split_layers_weighted(struct llama_context * ctx, float device_weights[], size_t num_weights) {
-//#ifdef GGML_USE_MPI
-//    if (ggml_mpi_rank(ctx->ctx_mpi) == 0 && ggml_mpi_size(ctx->ctx_mpi) != num_weights) {
-//        GGML_ASSERT(false && "Must have same number of split percentages as devices");
-//    }
-//    uint16_t** ranges = ggml_mpi_split_range(ctx->ctx_mpi, 0, ctx->model.hparams.n_layer - 1, device_weights);
-//    ggml_mpi_scatter_layers(ctx->ctx_mpi, ranges);
-//    free(ranges);
-//#endif
 
 }
 
 void llama_free(struct llama_context * ctx) {
@@ -13998,7 +14007,7 @@ int32_t llama_decode(
         struct llama_batch batch) {
 
 #ifdef GGML_USE_MPI
-    if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
+    if (ggml_mpi_rank(ctx->model.ctx_mpi) > 0) {
         // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
         const int n_ctx = llama_n_ctx(ctx);
         std::vector<llama_token> tmp(n_ctx, llama_token_bos(&ctx->model));
llama.h (2 changed lines)
@@ -203,7 +203,7 @@ extern "C" {
 
     struct llama_model_params {
         // Array of layers to allocate to each node
-        int32_t* n_node_layers;
+        const float * node_layer_weights;
 
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
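On the caller side, the int32_t * n_node_layers field is replaced by const float * node_layer_weights, and the common-code hunk at the top wires params.mpi_layer_split into it after freeing the one-element array that llama_model_default_params() calloc()s. A hedged usage sketch of the new field, assuming this fork's llama.h; the weight values and model path are made up, and MPI launch details are omitted:

    // usage_sketch.cpp: illustrative only, compiles against this fork's llama.h
    #include <cstdlib>
    #include <vector>
    #include "llama.h"

    int main() {
        std::vector<float> layer_split = {0.50f, 0.25f, 0.25f}; // one weight per MPI rank

        llama_model_params mparams = llama_model_default_params();

        // the defaults calloc() a placeholder array, so release it before
        // pointing at caller-owned weights, as the first hunk of this commit does
        free((void *) mparams.node_layer_weights);
        mparams.node_layer_weights = layer_split.data();

        llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (model == nullptr) {
            return 1;
        }
        llama_free_model(model);
        return 0;
    }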