From b7599f7a563d567447e0c991e1dc1c84eb002e38 Mon Sep 17 00:00:00 2001
From: Branden Butler
Date: Tue, 31 Oct 2023 15:55:15 -0500
Subject: [PATCH] Fix some mpi mem leaks, add mpi-layer-split to help when using mpi

---
 common/common.cpp |  3 +++
 ggml-mpi.c        | 52 ++++++++++++++++++++++++++++++++--------------------
 ggml-mpi.h        |  3 ++-
 llama.cpp         |  1 +
 4 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index c58477fd6..d924c80dc 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1094,6 +1094,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
         printf("  -mg i, --main-gpu i   the GPU to use for the model (with split-mode = none),\n");
         printf("                        or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
     }
+#ifdef GGML_USE_MPI
+    printf("  --mpi-layer-split N   percentiles to split the layers by across nodes\n");
+#endif
     printf("  --verbose-prompt      print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
     printf("  --no-display-prompt   don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
     printf("  -gan N, --grp-attn-n N\n");
diff --git a/ggml-mpi.c b/ggml-mpi.c
index 1e4d0b376..fd88eab1f 100644
--- a/ggml-mpi.c
+++ b/ggml-mpi.c
@@ -47,7 +47,7 @@ struct ggml_mpi_context * ggml_mpi_split_comm(struct ggml_mpi_context * ctx, int
 }
 
 void ggml_mpi_free(struct ggml_mpi_context * ctx) {
-    MPI_Comm_free(ctx->comm);
+    MPI_Comm_free(&(ctx->comm));
     free(ctx);
 }
 
@@ -55,7 +55,7 @@ int ggml_mpi_rank(struct ggml_mpi_context * ctx) {
     return ctx->rank;
 }
 
-int ggml_mpi_size(struct ggml_mpi_context * ctx) {
+size_t ggml_mpi_size(struct ggml_mpi_context * ctx) {
     return ctx->size;
 }
 
@@ -69,30 +69,41 @@ void ggml_mpi_eval_init(
     MPI_Barrier(ctx_mpi->comm);
 
-
+    int32_t old_n_tokens = *n_tokens;
     MPI_Bcast(n_tokens, 1, MPI_INT, 0, ctx_mpi->comm);
 
-    if (ctx_mpi->rank != 0) {
-        *pos = calloc(*n_tokens, sizeof(int32_t));
-        *n_seq_ids = calloc(*n_tokens, sizeof(int32_t));
-        *logits = calloc(*n_tokens, sizeof(int8_t));
+    // If what was passed in differs from what was broadcast,
+    // we can't guarantee the allocated sizes are correct
+    // TODO check how often this is done and if it's a problem,
+    // try to allocate ahead of time
+    if (old_n_tokens != *n_tokens) {
+        *pos = realloc(*pos, *n_tokens * sizeof(int32_t));
+        *n_seq_ids = realloc(*n_seq_ids, *n_tokens * sizeof(int32_t ));
+        *logits = realloc(*logits, *n_tokens * sizeof(int32_t));
     }
 
+
+//    MPI_Bcast(&total_n_seq_ids, 1, MPI_INT32_T, 0, ctx_mpi->comm);
+    MPI_Bcast(*n_seq_ids, *n_tokens, MPI_INT32_T, 0, ctx_mpi->comm);
+
+    // We need to know the total number of sequence
+    // ids, so we count them all up
     int32_t total_n_seq_ids = 0;
-    for (size_t i = 0; i < *n_tokens; i++) {
+    for (int32_t i = 0; i < *n_tokens; i++) {
         total_n_seq_ids += (*n_seq_ids)[i];
     }
 
-    MPI_Bcast(&total_n_seq_ids, 1, MPI_INT32_T, 0, ctx_mpi->comm);
-    MPI_Bcast(*n_seq_ids, *n_tokens, MPI_INT32_T, 0, ctx_mpi->comm);
-
+    // MPI can't chase the pointers for multidimensional arrays, so we flatten them first
+    // for transit
     int32_t * flattened_seq_ids = calloc(total_n_seq_ids, sizeof(int32_t));
 
     int32_t current_index = 0;
+    // Only rank 0 needs to flatten since the others don't have the real seq_id
     if (ctx_mpi->rank == 0) {
-        for (size_t i = 0; i < *n_tokens; i++) {
-            for (size_t j = 0; j < (*n_seq_ids)[i]; j++) {
+        for (int32_t i = 0; i < *n_tokens; i++) {
+            for (int32_t j = 0; j < (*n_seq_ids)[i]; j++) {
                 flattened_seq_ids[current_index] = (*seq_id)[i][j];
                 current_index++;
             }
@@ -100,25 +111,26 @@ void ggml_mpi_eval_init(
     }
 
-    MPI_Bcast(*pos, *n_tokens, MPI_INT32_T, 0, ctx_mpi->comm);
-    MPI_Bcast(flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, 0, ctx_mpi->comm);
+    MPI_Bcast(             *pos, *n_tokens, MPI_INT32_T, 0, ctx_mpi->comm);
+    MPI_Bcast(flattened_seq_ids, total_n_seq_ids, MPI_INT32_T, 0, ctx_mpi->comm);
     //MPI_Bcast(*logits, *n_tokens, MPI_INT8_T, 0, ctx_mpi->comm);
     int32_t ** new_seq_id = calloc(*n_tokens, sizeof(int32_t*));
     current_index = 0;
-    for (size_t i = 0; i < *n_tokens; i++) {
+    for (int32_t i = 0; i < *n_tokens; i++) {
         new_seq_id[i] = calloc((*n_seq_ids)[i], sizeof(int32_t));
-        for (size_t j = 0; j < (*n_seq_ids)[i]; j++) {
+        for (int32_t j = 0; j < (*n_seq_ids)[i]; j++) {
             new_seq_id[i][j] = flattened_seq_ids[current_index];
             current_index++;
         }
     }
     free(flattened_seq_ids);
+    //free(*seq_id); // <- something is still holding onto this, need to investigate
     *seq_id = new_seq_id;
 }
 
 void ggml_mpi_synch_int(
-        struct ggml_mpi_context * ctx_mpi,
-        int32_t * val
+    struct ggml_mpi_context * ctx_mpi,
+    int32_t * val
 ) {
     MPI_Bcast(val, 1, MPI_INT32_T, 0, ctx_mpi->comm);
 }
@@ -284,7 +296,7 @@ void ggml_mpi_graph_compute_pre(
 
     {
-        const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size;
+        //const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size;
 
         const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1;
 
diff --git a/ggml-mpi.h b/ggml-mpi.h
index f3c4bf2aa..62b15faef 100644
--- a/ggml-mpi.h
+++ b/ggml-mpi.h
@@ -1,5 +1,6 @@
 #pragma once
 #include <stdint.h>
+#include <stddef.h>
 
 struct ggml_context;
 struct ggml_tensor;
@@ -98,7 +99,7 @@ int ggml_mpi_rank(struct ggml_mpi_context * ctx);
  * @param ctx The context containing the communicator used for this size check.
  * @return The number of nodes that are a part of the given context's communicator.
  */
-int ggml_mpi_size(struct ggml_mpi_context * ctx);
+size_t ggml_mpi_size(struct ggml_mpi_context * ctx);
 
 /**
  * Synchronize needed information among the nodes
diff --git a/llama.cpp b/llama.cpp
index a5f56b552..d1f356504 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -13094,6 +13094,7 @@ void llama_split_layers_weighted(struct llama_context * ctx, float device_weight
     }
     uint16_t** ranges = ggml_mpi_split_range(ctx->ctx_mpi, 0, ctx->model.hparams.n_layer - 1, device_weights);
     ggml_mpi_scatter_layers(ctx->ctx_mpi, ranges);
+    free(ranges);
 #endif
 }
 
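Context for the first fix, not part of the patch: MPI_Comm_free() takes a pointer to the communicator handle (int MPI_Comm_free(MPI_Comm *)), so the old call MPI_Comm_free(ctx->comm) passed the handle by value and the derived communicator was never released. The standalone C sketch below illustrates the create/free pattern; it assumes, purely for illustration, that ggml_mpi_split_comm() builds its sub-communicator with MPI_Comm_split() or something similar, which this diff does not show.

    // Minimal sketch, assuming a split-style sub-communicator: shows why the
    // fixed ggml_mpi_free() passes the address of the communicator, and that
    // every communicator created by MPI_Comm_split() must be freed or it
    // leaks until MPI_Finalize().
    #include <mpi.h>
    #include <stdio.h>

    int main(int argc, char ** argv) {
        MPI_Init(&argc, &argv);

        int rank = 0;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        // Derive a sub-communicator (hypothetical stand-in for whatever
        // ggml_mpi_split_comm() does internally).
        MPI_Comm node_comm;
        MPI_Comm_split(MPI_COMM_WORLD, rank % 2, rank, &node_comm);

        int node_size = 0;
        MPI_Comm_size(node_comm, &node_size);
        printf("rank %d: sub-communicator size %d\n", rank, node_size);

        // Correct cleanup mirrors the fixed call MPI_Comm_free(&(ctx->comm)):
        // pass the address of the handle so the library can release it.
        MPI_Comm_free(&node_comm);

        MPI_Finalize();
        return 0;
    }

With an MPI toolchain installed this can be built and run with something like "mpicc demo.c -o demo && mpirun -np 2 ./demo"; with two ranks and color rank % 2, each rank should report a sub-communicator of size 1.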