From 4692644ff9262c0db9d941c691194ec0e05bcce0 Mon Sep 17 00:00:00 2001
From: Branden Butler
Date: Wed, 13 Mar 2024 01:38:38 -0500
Subject: [PATCH] Remove hard-coded layer splits and support more than 2 nodes

---
 common/common.cpp |  4 +++
 ggml-mpi.cpp      | 66 ++++++++++++++++++++++++-----------------
 ggml-mpi.h        |  2 +-
 ggml.h            |  2 +-
 llama.cpp         | 59 ++++++++++++++++++++++++------------------
 llama.h           |  2 +-
 6 files changed, 75 insertions(+), 60 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 46ec366b0..4a8e93cc5 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1319,6 +1319,10 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
         mparams.kv_overrides = params.kv_overrides.data();
     }
 
+    free((void *) mparams.node_layer_weights);
+
+    mparams.node_layer_weights = params.mpi_layer_split.data();
+
     return mparams;
 }
 
diff --git a/ggml-mpi.cpp b/ggml-mpi.cpp
index 30e74c8bc..95dcb0fd3 100644
--- a/ggml-mpi.cpp
+++ b/ggml-mpi.cpp
@@ -285,34 +285,27 @@ uint16_t** ggml_mpi_split_range(
     struct ggml_mpi_context * ctx_mpi,
     uint16_t start,
     uint16_t end,
-    float node_weights[]
+    const float node_weights[]
 ) {
     // Splits the range given by start and end
     // over the available nodes. This implementation
     // assumes that node 0 handles the final part of the range
     // while node 1 handles the beginning, to form a ring pipeline
 
-    // Only node 0 deals with the device splits, other nodes
-    // get the splits from the scatter layers operation
-
-    if (ctx_mpi->rank != 0) {
-        return NULL;
-    }
-
     uint16_t range_length = end - start + 1;
     uint16_t ** ranges = (uint16_t**) malloc(sizeof(uint16_t*) * ctx_mpi->size);
     for (int i = 0; i < ctx_mpi->size; i++) {
         ranges[i] = (uint16_t*) malloc(sizeof(uint16_t) * 2);
     }
     uint16_t next_layer = 0;
-    for (int i=1; i < ctx_mpi->size; i++) {
+    for (int i=0; i < ctx_mpi->size; i++) {
         ranges[i][0] = next_layer;
         ranges[i][1] = MIN(end, ranges[i][0] + (node_weights[i] * range_length) + start);
         next_layer = ranges[i][1];
     }
 
-    ranges[0][0] = next_layer;
-    ranges[0][1] = MIN(end, next_layer + (node_weights[0] * range_length) + start);
+//    ranges[0][0] = next_layer;
+//    ranges[0][1] = MIN(end, next_layer + (node_weights[0] * range_length) + start);
 
     return ranges;
 }
@@ -775,8 +768,13 @@ GGML_CALL static void ggml_backend_mpi_buffer_free_buffer(ggml_backend_buffer_t
 
 GGML_CALL static void ggml_backend_mpi_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     auto * ctx = (ggml_backend_mpi_buffer_context *) buffer->context;
+
+    if (ggml_backend_mpi_buffer_rank(buffer) != ggml_backend_mpi_buffer_local_rank(buffer)) {
+        return;
+    }
+
 //    fprintf(stderr, "SETTING TENSOR WITHOUT MPI CALLS FOR %s (%s) AND TGT BUFFER %s\n", tensor->name, ggml_backend_buffer_name(tensor->buffer), ggml_backend_buffer_name(buffer));
-    return ctx->wrapped_buffer->iface.set_tensor(ctx->wrapped_buffer, tensor, data, offset, size);
+    ctx->wrapped_buffer->iface.set_tensor(ctx->wrapped_buffer, tensor, data, offset, size);
 }
 
 GGML_CALL static void ggml_backend_mpi_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -794,8 +792,12 @@ GGML_CALL static void ggml_backend_mpi_buffer_get_tensor(ggml_backend_buffer_t b
 }
 
 GGML_CALL static bool ggml_backend_mpi_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    if (ggml_backend_mpi_buffer_rank(src->buffer) == ggml_backend_mpi_buffer_rank(dst->buffer)) {
+        return ggml_backend_mpi_buffer_unwrap(buffer)->iface.cpy_tensor(ggml_backend_mpi_buffer_unwrap(buffer), src,
+                                                                        dst);
+    }
 
-    return ggml_backend_mpi_buffer_unwrap(buffer)->iface.cpy_tensor(ggml_backend_mpi_buffer_unwrap(buffer), src, dst);
+    return true;
 }
 
 GGML_CALL static void ggml_backend_mpi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -849,25 +851,25 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer
 }
 
 bool ggml_backend_mpi_cpy_tensor_async(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst) {
-    int src_rank = ggml_backend_mpi_buffer_rank(src->buffer);
-    int dst_rank = ggml_backend_mpi_buffer_rank(dst->buffer);
-
-    auto * ctx = static_cast<ggml_mpi_context *>(backend->context);
-
-    if (ctx->remote) {
-        return true;
-    }
-
-    if (src_rank == dst_rank) {
-//        src->buffer->iface.cpy_tensor(src->buffer, src, dst);
-        return true;
-    }
-
-    if (src_rank == ctx->rank) {
-        ggml_mpi_tensor_send(src, dst_rank, ctx->comm);
-    } else if (dst_rank == ctx->rank){
-        ggml_mpi_tensor_recv(dst, src_rank, ctx->comm);
-    }
+//    int src_rank = ggml_backend_mpi_buffer_rank(src->buffer);
+//    int dst_rank = ggml_backend_mpi_buffer_rank(dst->buffer);
+//
+//    auto * ctx = static_cast<ggml_mpi_context *>(backend->context);
+//
+//    if (ctx->remote) {
+//        return true;
+//    }
+//
+//    if (src_rank == dst_rank) {
+////        src->buffer->iface.cpy_tensor(src->buffer, src, dst);
+//        return true;
+//    }
+//
+//    if (src_rank == ggml_backend_mpi_local_rank(backend)) {
+//        ggml_mpi_tensor_send(src, dst_rank, ctx->comm);
+//    } else if (dst_rank == ggml_backend_mpi_local_rank(backend)){
+//        ggml_mpi_tensor_recv(dst, src_rank, ctx->comm);
+//    }
 
     return true;
 }
diff --git a/ggml-mpi.h b/ggml-mpi.h
index b4f616d60..fe8358f2d 100644
--- a/ggml-mpi.h
+++ b/ggml-mpi.h
@@ -202,7 +202,7 @@ uint16_t** ggml_mpi_split_range(
     struct ggml_mpi_context * ctx_mpi,
     uint16_t start,
     uint16_t end,
-    float node_weights[]
+    const float node_weights[]
 );
 
 // BACKEND V2
diff --git a/ggml.h b/ggml.h
index 3544e9d6a..e4aabab05 100644
--- a/ggml.h
+++ b/ggml.h
@@ -226,7 +226,7 @@
 
 #define GGML_MAX_DIMS           4
 #define GGML_MAX_PARAMS         2048
-#define GGML_MAX_CONTEXTS       128
+#define GGML_MAX_CONTEXTS       256
 #define GGML_MAX_SRC            10
 #ifndef GGML_MAX_NAME
 #define GGML_MAX_NAME           64
diff --git a/llama.cpp b/llama.cpp
index 9e0343cad..96b2adbbe 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2001,6 +2001,10 @@ struct llama_model {
     int64_t t_load_us = 0;
     int64_t t_start_us = 0;
 
+#ifdef GGML_USE_MPI
+    ggml_mpi_context * ctx_mpi = nullptr;
+#endif
+
     ~llama_model() {
         for (struct ggml_context * ctx : ctxs) {
             ggml_free(ctx);
         }
@@ -2099,9 +2103,7 @@ struct llama_context {
     struct ggml_tensor * inp_s_mask;  // F32 [1, kv_size]
     struct ggml_tensor * inp_s_seq;   // I32 [kv_size, n_batch]
 
-#ifdef GGML_USE_MPI
-    ggml_mpi_context * ctx_mpi = NULL;
-#endif
+
 };
 
 //
@@ -3277,6 +3279,11 @@ static void llm_load_hparams(
     auto & hparams = model.hparams;
     const gguf_context * ctx = ml.ctx_gguf;
 
+#ifdef GGML_USE_MPI
+    model.ctx_mpi = ggml_mpi_init();
+
+#endif
+
     // get metadata as string
     for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
         enum gguf_type type = gguf_get_kv_type(ctx, i);
@@ -4008,6 +4015,7 @@ static bool llm_load_tensors(
     enum llama_split_mode split_mode,
     int main_gpu,
     const float * tensor_split,
+    const float * node_split,
     bool use_mlock,
     llama_progress_callback progress_callback,
     void * progress_callback_user_data) {
@@ -4097,11 +4105,17 @@ static bool llm_load_tensors(
     }
 
 #ifdef GGML_USE_MPI
-    // TESTING: Setting all non-input/output layers to node 1
-    for (int64_t i = 0; i < n_layer; i++) {
-        ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[i].buft, 1);
-        ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[i].buft_matrix, 1);
+    uint16_t** ranges = ggml_mpi_split_range(model.ctx_mpi, 0, n_layer - 1, node_split);
+
+    size_t size = ggml_mpi_size(model.ctx_mpi);
+
+    for (size_t i = 0; i < size; i++) {
+        for (uint16_t j = ranges[i][0]; j < ranges[i][1]; j++) {
+            printf("Setting buffer rank for i %zu and j %d\n", i, j);
+            ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[j].buft, (int)i);
+            ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[j].buft_matrix, (int)i);
+        }
     }
 
 
@@ -5101,7 +5115,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
 #endif
 
     if (!llm_load_tensors(
-        ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
+        ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.node_layer_weights, params.use_mlock,
         params.progress_callback, params.progress_callback_user_data
     )) {
         return -2;
     }
@@ -8813,7 +8827,7 @@ static int llama_decode_internal(
     uint32_t n_tokens_all = batch_all.n_tokens;
 
 #ifdef GGML_USE_MPI
-    ggml_mpi_eval_init(lctx.ctx_mpi, &(batch_all.n_tokens), &(batch_all.pos), &(batch_all.n_seq_id), &(batch_all.seq_id), &(batch_all.logits), lctx.cparams.n_seq_max);
+    ggml_mpi_eval_init(lctx.model.ctx_mpi, &(batch_all.n_tokens), &(batch_all.pos), &(batch_all.n_seq_id), &(batch_all.seq_id), &(batch_all.logits), lctx.cparams.n_seq_max);
     n_tokens_all = batch_all.n_tokens;
 #endif
 
@@ -9003,7 +9017,7 @@ static int llama_decode_internal(
         // update the graphs to skip "result_output" if logits are not needed
         if (res) {
 #ifdef GGML_USE_MPI
-            if (ggml_mpi_rank(lctx.ctx_mpi) == 0) {
+            if (ggml_mpi_rank(lctx.model.ctx_mpi) == 0) {
 #endif
 
             ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
@@ -12636,7 +12650,7 @@ static int llama_apply_lora_from_file_internal(
 //
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
-        static_cast<int32_t *>(calloc(1, sizeof(int32_t))),
+        static_cast<const float *>(calloc(1, sizeof(float))),
         /*.n_gpu_layers =*/ 0,
         /*.split_mode   =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu     =*/ 0,
@@ -12706,7 +12720,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 
 int llama_node_id(struct llama_context * ctx) {
 #ifdef GGML_USE_MPI
-    return ggml_mpi_rank(ctx->ctx_mpi);
+    return ggml_mpi_rank(ctx->model.ctx_mpi);
 #endif
 
     return 0;
 }
@@ -13026,8 +13040,13 @@ struct llama_context * llama_new_context_with_model(
 
 #ifdef GGML_USE_MPI
+    std::vector<ggml_backend_t> new_backends;
 
-    ctx->backends = {ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size(), 1), ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size(), 0)};
+    for (size_t i = 0; i < ggml_mpi_size(model->ctx_mpi); i++) {
+        new_backends.push_back(ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size(), (int) i));
+    }
+
+    ctx->backends = new_backends;
 
 
@@ -13144,23 +13163,13 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
-#ifdef GGML_USE_MPI
-    ctx->ctx_mpi = ggml_mpi_init();
-#endif
 
     return ctx;
 }
 
 void llama_split_layers_weighted(struct llama_context * ctx, float device_weights[], size_t num_weights) {
-//#ifdef GGML_USE_MPI
-//    if (ggml_mpi_rank(ctx->ctx_mpi) == 0 && ggml_mpi_size(ctx->ctx_mpi) != num_weights) {
-//        GGML_ASSERT(false && "Must have same number of split percentages as devices");
-//    }
-//    uint16_t** ranges = ggml_mpi_split_range(ctx->ctx_mpi, 0, ctx->model.hparams.n_layer - 1, device_weights);
-//    ggml_mpi_scatter_layers(ctx->ctx_mpi, ranges);
-//    free(ranges);
-//#endif
+
 }
 
 void llama_free(struct llama_context * ctx) {
@@ -13998,7 +14007,7 @@ int32_t llama_decode(
         struct llama_batch batch) {
 #ifdef GGML_USE_MPI
-    if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
+    if (ggml_mpi_rank(ctx->model.ctx_mpi) > 0) {
         // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
         const int n_ctx = llama_n_ctx(ctx);
         std::vector<llama_token> tmp(n_ctx, llama_token_bos(&ctx->model));
diff --git a/llama.h b/llama.h
index 2f2e775ca..48ac9a324 100644
--- a/llama.h
+++ b/llama.h
@@ -203,7 +203,7 @@ extern "C" {
     struct llama_model_params {
         // Array of layers to allocate to each node
-        int32_t* n_node_layers;
+        const float * node_layer_weights;
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
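
Note on the weighted split (illustration only, not part of the patch): the llm_load_tensors hunk above assigns each transformer layer to an MPI rank by walking the ranges returned by ggml_mpi_split_range. The standalone sketch below mirrors that arithmetic so the mapping can be checked outside the build; the rank count, weights, and layer count are made-up example values, and the ggml/MPI APIs are not used here.

// weighted_split_sketch.cpp -- standalone illustration of the layer split math
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const uint16_t n_layer = 32;                                   // example model depth
    const std::vector<float> node_weights = {0.5f, 0.25f, 0.25f};  // one weight per MPI rank (example)

    const uint16_t start        = 0;
    const uint16_t end          = n_layer - 1;
    const uint16_t range_length = end - start + 1;

    uint16_t next_layer = 0;
    for (size_t i = 0; i < node_weights.size(); i++) {
        // Same truncating integer math as the patched loop:
        //   ranges[i][1] = MIN(end, ranges[i][0] + node_weights[i] * range_length + start)
        uint16_t lo = next_layer;
        uint16_t hi = std::min<uint16_t>(end, (uint16_t)(lo + node_weights[i] * range_length + start));
        next_layer = hi;

        // llm_load_tensors() then sets the buffer-type rank for layers j in [lo, hi) to rank i
        printf("rank %zu gets layers [%u, %u)\n", i, (unsigned) lo, (unsigned) hi);
    }
    return 0;
}

With these example inputs the sketch prints ranges [0, 16), [16, 24), and [24, 31); the last upper bound is clamped to end = n_layer - 1 by the MIN(end, ...) term, matching the patched loop.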