From b98274c76fa5c1b534237fd5327aae4a7435b0c5 Mon Sep 17 00:00:00 2001
From: Branden Butler
Date: Mon, 5 Feb 2024 17:19:45 -0600
Subject: [PATCH] Begin transition to backend v2

---
 CMakeLists.txt             |   2 +-
 Makefile                   |   2 +-
 ggml-mpi.c => ggml-mpi.cpp | 210 +++++++++++++++++++++++++++++++------
 ggml-mpi.h                 |  19 ++++
 ggml.h                     |   1 +
 llama.cpp                  |  31 ++++++
 6 files changed, 233 insertions(+), 32 deletions(-)
 rename ggml-mpi.c => ggml-mpi.cpp (67%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3ac2804a6..b805650d3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -426,7 +426,7 @@ if (LLAMA_MPI)
         message(STATUS "MPI found")
 
         set(GGML_HEADERS_MPI ggml-mpi.h)
-        set(GGML_SOURCES_MPI ggml-mpi.c)
+        set(GGML_SOURCES_MPI ggml-mpi.cpp)
 
         add_compile_definitions(GGML_USE_MPI)
         add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
diff --git a/Makefile b/Makefile
index cb597b209..f8ad9f146 100644
--- a/Makefile
+++ b/Makefile
@@ -573,7 +573,7 @@ endif
 endif # LLAMA_METAL
 
 ifdef LLAMA_MPI
-ggml-mpi.o: ggml-mpi.c ggml-mpi.h
-	$(CC) $(CFLAGS) -c $< -o $@
+ggml-mpi.o: ggml-mpi.cpp ggml-mpi.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # LLAMA_MPI
 
diff --git a/ggml-mpi.c b/ggml-mpi.cpp
similarity index 67%
rename from ggml-mpi.c
rename to ggml-mpi.cpp
index c10faa252..b43dd96d1 100644
--- a/ggml-mpi.c
+++ b/ggml-mpi.cpp
@@ -1,11 +1,14 @@
 #include "ggml-mpi.h"
 
 #include "ggml.h"
+#include "ggml-backend.h"
+#include "ggml-backend-impl.h"
 
 #include <mpi.h>
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <string>
 
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 
@@ -17,6 +20,8 @@ struct ggml_mpi_context {
     MPI_Comm comm;
     int layer_start;
     int layer_end;
+    struct ggml_tensor * inp0;
+    std::string name;
 };
 
 void ggml_mpi_backend_init(void) {
@@ -29,7 +34,7 @@ void ggml_mpi_backend_free(void) {
 }
 
 struct ggml_mpi_context * ggml_mpi_init(void) {
-    struct ggml_mpi_context * ctx = calloc(1, sizeof(struct ggml_mpi_context));
+    auto * ctx = static_cast<struct ggml_mpi_context *>(calloc(1, sizeof(struct ggml_mpi_context)));
 
     MPI_Comm_rank(MPI_COMM_WORLD, &ctx->rank);
     MPI_Comm_size(MPI_COMM_WORLD, &ctx->size);
@@ -39,7 +44,7 @@ struct ggml_mpi_context * ggml_mpi_init(void) {
 
 struct ggml_mpi_context * ggml_mpi_split_comm(struct ggml_mpi_context * ctx, int color, int key) {
-    struct ggml_mpi_context * newCtx = calloc(1, sizeof(struct ggml_mpi_context));
+    auto * newCtx = static_cast<struct ggml_mpi_context *>(calloc(1, sizeof(struct ggml_mpi_context)));
     MPI_Comm_split(ctx->comm, color, key, &newCtx->comm);
     MPI_Comm_rank(newCtx->comm, &newCtx->rank);
     MPI_Comm_size(newCtx->comm, &newCtx->size);
 
@@ -70,16 +75,16 @@ void ggml_mpi_eval_init(
     MPI_Barrier(ctx_mpi->comm);
 
     int32_t old_n_tokens = *n_tokens;
-    MPI_Bcast(n_tokens, 1, MPI_INT, 0, ctx_mpi->comm);
+    MPI_Bcast(n_tokens, 1, MPI_INT32_T, 0, ctx_mpi->comm);
 
     // If what was passed in differs from what was broadcast,
     // we can't guarantee the allocated sizes are correct
     // TODO check how often this is done and if it's a problem,
     // try to allocate ahead of time
     if (old_n_tokens != *n_tokens) {
-        *pos = realloc(*pos, *n_tokens * sizeof(int32_t));
-        *n_seq_ids = realloc(*n_seq_ids, *n_tokens * sizeof(int32_t ));
-        *logits = realloc(*logits, *n_tokens * sizeof(int32_t));
+        *pos = static_cast<int32_t *>(realloc(*pos, *n_tokens * sizeof(int32_t)));
+        *n_seq_ids = static_cast<int32_t *>(realloc(*n_seq_ids, *n_tokens * sizeof(int32_t)));
+        *logits = static_cast<int8_t *>(realloc(*logits, *n_tokens * sizeof(int32_t)));
     }
 
@@ -96,7 +101,7 @@ void ggml_mpi_eval_init(
 
     // MPI can't chase the pointers for multidimensional arrays, so we flatten them first
     // for transit
-    int32_t * flattened_seq_ids = calloc(total_n_seq_ids, sizeof(int32_t));
+    auto * flattened_seq_ids = static_cast<int32_t *>(calloc(total_n_seq_ids, sizeof(int32_t)));
 
     int32_t current_index = 0;
 
@@ -114,10 +119,10 @@ void ggml_mpi_eval_init(
     MPI_Bcast(             *pos, *n_tokens, MPI_INT32_T, 0, ctx_mpi->comm);
     MPI_Bcast(flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, 0, ctx_mpi->comm);
     //MPI_Bcast(*logits, *n_tokens, MPI_INT8_T, 0, ctx_mpi->comm);
-    int32_t ** new_seq_id = calloc(*n_tokens, sizeof(int32_t*));
+    auto ** new_seq_id = static_cast<int32_t **>(calloc(*n_tokens, sizeof(int32_t *)));
     current_index = 0;
     for (int32_t i = 0; i < *n_tokens; i++) {
-        new_seq_id[i] = calloc((*n_seq_ids)[i], sizeof(int32_t));
+        new_seq_id[i] = static_cast<int32_t *>(calloc((*n_seq_ids)[i], sizeof(int32_t)));
         for (int32_t j = 0; j < (*n_seq_ids)[i]; j++) {
             new_seq_id[i][j] = flattened_seq_ids[current_index];
             current_index++;
@@ -176,7 +181,7 @@ static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src, MPI_C
     }
     MPI_Status status; UNUSED(status);
-
+    fprintf(stderr, "%s: tensor receive == null: %d\n", __func__, t->data == NULL);
     const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, comm, &status);
     GGML_ASSERT(retval == MPI_SUCCESS);
 }
 
@@ -241,11 +246,7 @@ void ggml_mpi_scatter_layers(
     fprintf(stderr, "Ranges for rank %d: [%d, %d]\n", ctx_mpi->rank, ctx_mpi->layer_start, ctx_mpi->layer_end);
 }
 
-// TODO: there are many improvements that can be done to this implementation
-void ggml_mpi_graph_compute_pre(
-        struct ggml_mpi_context * ctx_mpi,
-        struct ggml_cgraph * gf,
-        int n_layers) {
+void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * gf, int n_layers) {
     const int mpi_rank = ctx_mpi->rank;
     const int mpi_size = ctx_mpi->size;
 
@@ -261,6 +262,8 @@
         return;
     }
 
+    ctx_mpi->inp0 = inp0;
+
 //    fprintf(stderr, "gf->nodes[0] == %s\n", ggml_get_name(gf->nodes[0]));
 //
 //    GGML_ASSERT(inp0 == gf->nodes[0]);
@@ -278,23 +281,11 @@
 //
 
-
-    if (mpi_rank > 0) {
-        if (mpi_rank == 1) {
-            // the first node (1) receives the input tokens from the main node (0)
-            ggml_mpi_tensor_recv(inp_tokens, 0, ctx_mpi->comm);
-        } else {
-            // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph)
-            ggml_mpi_tensor_recv(inp0, mpi_rank - 1, ctx_mpi->comm);
-        }
-    } else if (mpi_size > 1) {
-        // node 0 sends the input tokens to node 1
-        ggml_mpi_tensor_send(inp_tokens, 1, ctx_mpi->comm);
-
-        // recv the output data from the last node
-        ggml_mpi_tensor_recv(inp0, mpi_size - 1, ctx_mpi->comm);
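+    // Tag every node in the graph for the MPI split backend; the per-rank
+    // send/recv of inp_tokens/inp0 removed here moves into the new
+    // ggml_mpi_graph_compute_pre() added further down in this file.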
+    for (int i = 0; i < gf->n_nodes; i++) {
+        gf->nodes[i]->backend = GGML_BACKEND_MPI_SPLIT;
     }
+
 
     {
@@ -347,6 +338,47 @@
     }
 }
 
+// TODO: there are many improvements that can be done to this implementation
+void ggml_mpi_graph_compute_pre(
+        struct ggml_mpi_context * ctx_mpi,
+        struct ggml_cgraph * gf,
+        int n_layers) {
+    const int mpi_rank = ctx_mpi->rank;
+    const int mpi_size = ctx_mpi->size;
+
+    struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
+    if (inp_tokens == NULL) {
+        fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__);
+        return;
+    }
+
+    struct ggml_tensor * inp0 = ctx_mpi->inp0;
+    if (inp0 == NULL) {
+        fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__);
+        return;
+    }
+
+    if (mpi_rank > 0) {
+        if (mpi_rank == 1) {
+            // the first node (1) receives the input tokens from the main node (0)
+            if (inp_tokens->data == NULL) {
+
+            }
+            ggml_mpi_tensor_recv(inp_tokens, 0, ctx_mpi->comm);
+        } else {
+            // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph)
+            fprintf(stderr, "%s:%d: receiving layer inp0\n", __func__, ctx_mpi->rank);
+            ggml_mpi_tensor_recv(inp0, mpi_rank - 1, ctx_mpi->comm);
+        }
+    } else if (mpi_size > 1) {
+        // node 0 sends the input tokens to node 1
+        ggml_mpi_tensor_send(inp_tokens, 1, ctx_mpi->comm);
+
+        // recv the output data from the last node
+        ggml_mpi_tensor_recv(inp0, mpi_size - 1, ctx_mpi->comm);
+    }
+}
+
 void ggml_mpi_graph_compute_post(
         struct ggml_mpi_context * ctx_mpi,
         struct ggml_cgraph * gf,
@@ -361,3 +393,121 @@
         ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size, ctx_mpi->comm);
     }
 }
+
+// BACKEND V2
+
+static const char * ggml_backend_mpi_name(ggml_backend_t backend) {
+    auto * ctx = static_cast<ggml_mpi_context *>(backend->context);
+    return ctx->name.c_str();
+}
+
+static void ggml_backend_mpi_free(ggml_backend_t backend) {
+    auto * ctx = static_cast<ggml_mpi_context *>(backend->context);
+
+    delete ctx;
+
+    delete backend;
+}
+
+static ggml_backend_buffer_type_t ggml_backend_mpi_get_default_buffer_type(ggml_backend_t backend) {
+    return ggml_backend_cpu_buffer_type();
+}
+
+GGML_CALL static bool ggml_backend_mpi_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_CPY:
+            return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float
+        case GGML_OP_MUL_MAT:
+            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
+        default:
+            return true;
+    }
+
+    GGML_UNUSED(backend);
+}
+
+static struct ggml_backend_i mpi_backend_i = {
+    /* .get_name                = */ ggml_backend_mpi_name,
+    /* .free                    = */ ggml_backend_mpi_free,
+    /* .get_default_buffer_type = */ ggml_backend_mpi_get_default_buffer_type,
+    /* .set_tensor_async        = */ NULL,
+    /* .get_tensor_async        = */ NULL,
+    /* .cpy_tensor_async        = */ NULL,
+    /* .synchronize             = */ NULL,
+    /* .graph_plan_create       = */ NULL,
+    /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_compute      = */ NULL,
+    /* .graph_compute           = */ ggml_backend_graph_compute,
+    /* .supports_op             = */ ggml_backend_mpi_supports_op,
+};
+
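+// Enumerate one ggml_mpi_device per rank in MPI_COMM_WORLD, each wrapping its own ggml_mpi_context.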
+std::vector<ggml_mpi_device> ggml_mpi_available_devices_internal() {
+    static bool has_init = false;
+    if (!has_init) {
+        ggml_mpi_backend_init();
+        has_init = true;
+    }
+    std::vector<ggml_mpi_device> devices;
+    int s;
+    MPI_Comm_size(MPI_COMM_WORLD, &s);
+    devices.resize(s);
+    for (int i = 0; i < s; i++) {
+        devices[i] = ggml_mpi_device{
+            i,
+            ggml_mpi_init(),
+            ("MPI_COMM_WORLD:" + std::to_string(i)).c_str(),
+            1
+        };
+    }
+    return devices;
+}
+
+ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_type_t buft) {
+    auto * ggml_backend_wrapped_buffer_type = new ggml_backend_buffer_type{
+            /* .iface   = */ buft->iface,
+            /* .context = */ buft->context,
+    };
+
+    return ggml_backend_wrapped_buffer_type;
+}
+
+ggml_backend_t ggml_backend_mpi_init(int index) {
+    auto * mpi_backend = new ggml_backend {
+        /* .interface = */ mpi_backend_i,
+        /* .context   = */ ggml_mpi_init(),
+    };
+
+    return mpi_backend;
+}
+
+static ggml_backend_t ggml_backend_reg_mpi_init(const char * params, void * user_data) {
+    GGML_UNUSED(params);
+    return ggml_backend_mpi_init(intptr_t(user_data));
+}
+
+
+ggml_backend_buffer_type_t ggml_backend_mpi_buffer_type(int index) {
+    return ggml_backend_cpu_buffer_type();
+}
+
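+// Registration entry point: registers one MPI backend per detected rank,
+// using the CPU buffer type as its default buffer type for now.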
+extern "C" GGML_CALL int ggml_backend_mpi_reg_devices();
+
+int ggml_backend_mpi_reg_devices() {
+    auto devices = ggml_mpi_available_devices_internal();
+    for (const auto & device : devices) {
+        ggml_backend_register(
+            device.name,
+            ggml_backend_reg_mpi_init,
+            ggml_backend_mpi_buffer_type(device.index),
+            reinterpret_cast<void *>(intptr_t(device.index))
+        );
+    }
+    return devices.size();
+}
+
+
+
diff --git a/ggml-mpi.h b/ggml-mpi.h
index 62b15faef..2a0c5809c 100644
--- a/ggml-mpi.h
+++ b/ggml-mpi.h
@@ -1,6 +1,8 @@
 #pragma once
 #include <stddef.h>
 #include <stdint.h>
+#include "ggml.h"
+#include "ggml-backend.h"
 
 struct ggml_context;
 struct ggml_tensor;
@@ -49,6 +51,11 @@ void ggml_mpi_backend_free(void);
  */
 struct ggml_mpi_context * ggml_mpi_init(void);
 
+void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * cgraph, int n_layers);
+
+GGML_API ggml_backend_t ggml_backend_mpi_init(int index);
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_type_t buft);
+
 /**
  * Create a new context by splitting the given context's
  * communicator, creating a "sub-communicator." This is a collective
@@ -194,6 +201,18 @@ void ggml_mpi_graph_compute_post(
         struct ggml_cgraph * gf,
         int n_layers);
 
+// BACKEND V2
+
+struct ggml_mpi_device {
+    int index;
+    struct ggml_mpi_context * ctx_mpi;
+    const char * name;
+    int subgroupSize;
+};
+
+#define MPI_BACKEND_NAME "MPI"
+GGML_CALL int ggml_backend_mpi_reg_devices();
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml.h b/ggml.h
index ab26c8f59..a4efe792d 100644
--- a/ggml.h
+++ b/ggml.h
@@ -379,6 +379,7 @@ extern "C" {
         GGML_BACKEND_TYPE_CPU = 0,
         GGML_BACKEND_TYPE_GPU = 10,
         GGML_BACKEND_TYPE_GPU_SPLIT = 20,
+        GGML_BACKEND_TYPE_MPI_SPLIT = 30,
     };
 
     // model file types
diff --git a/llama.cpp b/llama.cpp
index 2bdb38434..444c99e58 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4089,6 +4089,18 @@ static bool llm_load_tensors(
         }
     }
 
+#ifdef GGML_USE_MPI
+    for (int64_t i = 0; i < n_layer; i++) {
+        model.buft_layer[i] = {ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft_matrix),
+                               ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft)};
+    }
+
+    model.buft_input  = {ggml_backend_mpi_wrap_buffer(model.buft_input.buft_matrix),
+                         ggml_backend_mpi_wrap_buffer(model.buft_input.buft)};
+    model.buft_output = {ggml_backend_mpi_wrap_buffer(model.buft_output.buft_matrix),
+                         ggml_backend_mpi_wrap_buffer(model.buft_output.buft)};
+#endif
+
     // count used buffer types
     std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
     buft_layer_count[model.buft_input.buft]++;
@@ -4965,6 +4977,12 @@ static bool llm_load_tensors(
                 mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
             }
         }
+
+#ifdef GGML_USE_MPI
+        if (buf == nullptr) {
+            continue;
+        }
+#endif
         if (buf == nullptr) {
             throw std::runtime_error("failed to allocate buffer");
         }
@@ -12978,6 +12996,19 @@ struct llama_context * llama_new_context_with_model(
             }
             ctx->backends.push_back(backend);
         }
+#endif
+
+#ifdef GGML_USE_MPI
+        // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
+        ggml_backend_t backend = ggml_backend_mpi_init(model->main_gpu);
+        if (backend == nullptr) {
+            LLAMA_LOG_ERROR("%s: failed to initialize MPI backend %d\n", __func__, model->main_gpu);
+            llama_free(ctx);
+            return nullptr;
+        }
+        ctx->backends.push_back(backend);
+
+
 #endif
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {