From 968cefb4a9c430c09b2b7a4df9dbc24b74efe593 Mon Sep 17 00:00:00 2001
From: Branden Butler
Date: Mon, 19 Feb 2024 12:21:48 -0600
Subject: [PATCH] Wrap backends with MPI backend

---
 ggml-mpi.cpp | 124 +++++++++++++++++++--------------------------------
 ggml-mpi.h   |   9 ++--
 llama.cpp    |  50 +++++++++------------
 llama.h      |   3 ++
 4 files changed, 74 insertions(+), 112 deletions(-)

diff --git a/ggml-mpi.cpp b/ggml-mpi.cpp
index b43dd96d1..3d2fc829e 100644
--- a/ggml-mpi.cpp
+++ b/ggml-mpi.cpp
@@ -22,6 +22,7 @@ struct ggml_mpi_context {
     int layer_end;
     struct ggml_tensor *inp0;
     std::string name;
+    struct ggml_backend * wrapped_backend;
 };
 
 void ggml_mpi_backend_init(void) {
@@ -247,8 +248,6 @@ void ggml_mpi_scatter_layers(
 }
 
 void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * gf, int n_layers) {
-    const int mpi_rank = ctx_mpi->rank;
-    const int mpi_size = ctx_mpi->size;
 
     struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
     if (inp_tokens == NULL) {
@@ -286,73 +285,22 @@ void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml
 
     }
 
-    {
-
-
-        //const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size;
-
-        const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1;
-
-        //const int il0 = (mpi_idx + 0) * n_per_node;
-        //const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node);
-        int il0 = ctx_mpi->layer_start;
-        int il1 = MIN(n_layers, ctx_mpi->layer_end);
-
-        char name_l0[GGML_MAX_NAME];
-        char name_l1[GGML_MAX_NAME];
-
-        snprintf(name_l0, sizeof(name_l0), "layer_inp_%d", il0);
-        snprintf(name_l1, sizeof(name_l1), "layer_inp_%d", il1);
-
-        const int idx_l0 = ggml_graph_get_node_idx(gf, name_l0);
-        const int idx_l1 = mpi_rank > 0 ? ggml_graph_get_node_idx(gf, name_l1) + 1 : gf->n_nodes;
-
-        if (idx_l0 < 0 || idx_l1 < 0) {
-            fprintf(stderr, "%s: layer input nodes not found\n", __func__);
-            return;
-        }
-
-        // attach the input data to all nodes that need it
-        // TODO: not great - should be able to do this without modifying the compute graph (see next TODO below)
-        for (int i = idx_l0; i < idx_l1; i++) {
-            if (gf->nodes[i]->src[0] == gf->nodes[idx_l0]) {
-                gf->nodes[i]->src[0] = inp0;
-            }
-            if (gf->nodes[i]->src[1] == gf->nodes[idx_l0]) {
-                gf->nodes[i]->src[1] = inp0;
-            }
-        }
-
-        // TODO: instead of rearranging the nodes, we should be able to execute a subset of the compute graph
-        for (int i = 1; i < idx_l1 - idx_l0; i++) {
-            gf->nodes[i] = gf->nodes[idx_l0 + i];
-        }
-
-        // the first node performs the "get_rows" operation, the rest of the nodes get the data from the previous node
-        if (mpi_idx != 0) {
-            gf->nodes[0]->op = GGML_OP_NONE;
-        }
-
-        gf->n_nodes = idx_l1 - idx_l0;
-
-    }
 }
 
 // TODO: there are many improvements that can be done to this implementation
 
 void ggml_mpi_graph_compute_pre(
         struct ggml_mpi_context * ctx_mpi,
-        struct ggml_cgraph * gf,
-        int n_layers) {
+        struct ggml_cgraph * gf) {
     const int mpi_rank = ctx_mpi->rank;
     const int mpi_size = ctx_mpi->size;
 
-    struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
+    struct ggml_tensor * inp_tokens = gf->nodes[0];
     if (inp_tokens == NULL) {
         fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__);
         return;
     }
 
-    struct ggml_tensor * inp0 = ctx_mpi->inp0;
+    struct ggml_tensor * inp0 = ggml_graph_get_tensor(gf, "layer_inp_0");
     if (inp0 == NULL) {
         fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__);
         return;
@@ -381,9 +329,7 @@ void ggml_mpi_graph_compute_pre(
 
 void ggml_mpi_graph_compute_post(
         struct ggml_mpi_context * ctx_mpi,
-        struct ggml_cgraph * gf,
-        int n_layers) {
-    UNUSED(n_layers);
+        struct ggml_cgraph * gf) {
 
     const int mpi_rank = ctx_mpi->rank;
     const int mpi_size = ctx_mpi->size;
@@ -396,9 +342,24 @@ void ggml_mpi_graph_compute_post(
 
 // BACKEND V2
 
+GGML_CALL static bool ggml_backend_mpi_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+
+    struct ggml_mpi_context * ctx = (ggml_mpi_context *) backend->context;
+
+    ggml_mpi_graph_compute_pre(ctx, cgraph);
+
+    ggml_backend_t wrapped_backend = ctx->wrapped_backend;
+    bool ret = ggml_backend_graph_compute(wrapped_backend, cgraph);
+
+    ggml_mpi_graph_compute_post(ctx, cgraph);
+
+    return ret;
+}
+
+
 static const char * ggml_backend_mpi_name(ggml_backend_t backend) {
     auto * ctx = static_cast<ggml_mpi_context *>(backend->context);
-    return ctx->name.c_str();
+    return ctx->wrapped_backend->iface.get_name(backend);
 }
 
 static void ggml_backend_mpi_free(ggml_backend_t backend) {
@@ -427,20 +388,6 @@ GGML_CALL static bool ggml_backend_mpi_supports_op(ggml_backend_t backend, const
     GGML_UNUSED(backend);
 }
 
-static struct ggml_backend_i mpi_backend_i = {
-        /* .get_name                = */ ggml_backend_mpi_name,
-        /* .free                    = */ ggml_backend_mpi_free,
-        /* .get_default_buffer_type = */ ggml_backend_mpi_get_default_buffer_type,
-        /* .set_tensor_async        = */ NULL,
-        /* .get_tensor_async        = */ NULL,
-        /* .cpy_tensor_async        = */ NULL,
-        /* .synchronize             = */ NULL,
-        /* .graph_plan_create       = */ NULL,
-        /* .graph_plan_free         = */ NULL,
-        /* .graph_plan_compute      = */ NULL,
-        /* .graph_compute           = */ ggml_backend_graph_compute,
-        /* .supports_op             = */ ggml_backend_mpi_supports_op,
-};
 
 std::vector<ggml_mpi_device> ggml_mpi_available_devices_internal() {
@@ -473,23 +420,42 @@ ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_type
     return ggml_backend_wrapped_buffer_type;
 }
 
-ggml_backend_t ggml_backend_mpi_init(int index) {
+ggml_backend_t ggml_backend_mpi_init(ggml_backend_t wrapped_backend) {
+
+    struct ggml_backend_i mpi_backend_i = {
+            /* .get_name                = */ wrapped_backend->iface.get_name,
+            /* .free                    = */ ggml_backend_mpi_free,
+            /* .get_default_buffer_type = */ ggml_backend_mpi_get_default_buffer_type,
+            /* .set_tensor_async        = */ NULL,
+            /* .get_tensor_async        = */ NULL,
+            /* .cpy_tensor_async        = */ NULL,
+            /* .synchronize             = */ NULL,
+            /* .graph_plan_create       = */ NULL,
+            /* .graph_plan_free         = */ NULL,
+            /* .graph_plan_compute      = */ NULL,
+            /* .graph_compute           = */ ggml_backend_mpi_graph_compute,
+            /* .supports_op             = */ ggml_backend_mpi_supports_op,
+    };
+
+    ggml_mpi_context * ctx = ggml_mpi_init();
+    ctx->wrapped_backend = wrapped_backend;
 
     auto *mpi_backend = new ggml_backend {
             /* .interface = */ mpi_backend_i,
-            /* .context   = */ ggml_mpi_init(),
+            /* .context   = */ ctx,
     };
 
     return mpi_backend;
 }
 
 static ggml_backend_t ggml_backend_reg_mpi_init(const char * params, void * user_data) {
+    // TODO check what the parameters are for. Could use it to setup the MPI comms and routes?
     GGML_UNUSED(params);
-    return ggml_backend_mpi_init(intptr_t(user_data));
+    return ggml_backend_mpi_init(ggml_backend_cpu_init());
 }
 
-ggml_backend_buffer_type_t ggml_backend_mpi_buffer_type(int index) {
+ggml_backend_buffer_type_t ggml_backend_mpi_buffer_type() {
     return ggml_backend_cpu_buffer_type();
 }
 
@@ -501,7 +467,7 @@ int ggml_backend_mpi_reg_devices() {
         ggml_backend_register(
                 device.name,
                 ggml_backend_reg_mpi_init,
-                ggml_backend_mpi_buffer_type(device.index),
+                ggml_backend_mpi_buffer_type(),
                 reinterpret_cast<void *>(intptr_t(device.index))
         );
     }
diff --git a/ggml-mpi.h b/ggml-mpi.h
index 2a0c5809c..c72ec0444 100644
--- a/ggml-mpi.h
+++ b/ggml-mpi.h
@@ -53,7 +53,6 @@ struct ggml_mpi_context * ggml_mpi_init(void);
 
 void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * cgraph, int n_layers);
 
-GGML_API ggml_backend_t ggml_backend_mpi_init(int index);
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_type_t buft);
 
 /**
@@ -185,8 +184,7 @@ void ggml_mpi_scatter_layers(
  */
 void ggml_mpi_graph_compute_pre(
         struct ggml_mpi_context * ctx_mpi,
-        struct ggml_cgraph * gf,
-        int n_layers);
+        struct ggml_cgraph * gf);
 
 /**
  * Sends the output tensor to the next node for processing
@@ -198,8 +196,7 @@ void ggml_mpi_graph_compute_pre(
  */
 void ggml_mpi_graph_compute_post(
         struct ggml_mpi_context * ctx_mpi,
-        struct ggml_cgraph * gf,
-        int n_layers);
+        struct ggml_cgraph * gf);
 
 // BACKEND V2
 
@@ -213,6 +210,8 @@ struct ggml_mpi_device {
 #define MPI_BACKEND_NAME "MPI"
 GGML_CALL int ggml_backend_mpi_reg_devices();
 
+GGML_CALL ggml_backend_t ggml_backend_mpi_init(ggml_backend_t wrapped_backend);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/llama.cpp b/llama.cpp
index 444c99e58..edf2a03cf 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4090,15 +4090,15 @@ static bool llm_load_tensors(
     }
 
 #ifdef GGML_USE_MPI
-    for (int64_t i = 0; i < n_layer; i++) {
-        model.buft_layer[i] = {ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft_matrix),
-                               ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft)};
-    }
-
-    model.buft_input = {ggml_backend_mpi_wrap_buffer(model.buft_input.buft_matrix),
-                        ggml_backend_mpi_wrap_buffer(model.buft_input.buft)};
-    model.buft_output = {ggml_backend_mpi_wrap_buffer(model.buft_output.buft_matrix),
-                         ggml_backend_mpi_wrap_buffer(model.buft_output.buft)};
+//    for (int64_t i = 0; i < n_layer; i++) {
+//        model.buft_layer[i] = {ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft_matrix),
+//                               ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft)};
+//    }
+//
+//    model.buft_input = {ggml_backend_mpi_wrap_buffer(model.buft_input.buft_matrix),
+//                        ggml_backend_mpi_wrap_buffer(model.buft_input.buft)};
+//    model.buft_output = {ggml_backend_mpi_wrap_buffer(model.buft_output.buft_matrix),
+//                         ggml_backend_mpi_wrap_buffer(model.buft_output.buft)};
 #endif
 
     // count used buffer types
@@ -8764,10 +8764,7 @@ static void llama_graph_compute(
           llama_context & lctx,
             ggml_cgraph * gf,
                     int   n_threads) {
-#ifdef GGML_USE_MPI
-    const int64_t n_layer = lctx.model.hparams.n_layer;
-    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
-#endif
+
 
 #ifdef GGML_USE_METAL
     if (ggml_backend_is_metal(lctx.backend_metal)) {
@@ -8783,10 +8780,7 @@ static void llama_graph_compute(
 
     ggml_backend_sched_graph_compute_async(lctx.sched, gf);
     // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
-#endif
+
 }
 
 // decode a batch of tokens by evaluating the transformer
@@ -12619,6 +12613,7 @@ static int llama_apply_lora_from_file_internal(
 //
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
+        static_cast<int32_t *>(calloc(1, sizeof(int32_t))),
         /*.n_gpu_layers                =*/ 0,
         /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu                    =*/ 0,
@@ -12998,18 +12993,7 @@ struct llama_context * llama_new_context_with_model(
     }
 #endif
 
-#ifdef GGML_USE_MPI
-    // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
-    ggml_backend_t backend = ggml_backend_mpi_init(model->main_gpu);
-    if (backend == nullptr) {
-        LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
-        llama_free(ctx);
-        return nullptr;
-    }
-    ctx->backends.push_back(backend);
-
-#endif
 
     ctx->backend_cpu = ggml_backend_cpu_init();
     if (ctx->backend_cpu == nullptr) {
         LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
@@ -13018,6 +13002,16 @@ struct llama_context * llama_new_context_with_model(
     }
     ctx->backends.push_back(ctx->backend_cpu);
 
+#ifdef GGML_USE_MPI
+
+    for(auto & backend : ctx->backends) {
+        backend = ggml_backend_mpi_init(backend);
+
+    }
+
+    ctx->backend_cpu = ctx->backends.back();
+#endif
+
     if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, kv_size, cparams.offload_kqv)) {
         LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
         llama_free(ctx);
diff --git a/llama.h b/llama.h
index 818056064..2f2e775ca 100644
--- a/llama.h
+++ b/llama.h
@@ -202,6 +202,9 @@ extern "C" {
     };
 
     struct llama_model_params {
+        // Array of layers to allocate to each node
+        int32_t* n_node_layers;
+
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs