Remove hard-coded layer splits and support more than 2 nodes
parent 5f156f3a0c
commit 4692644ff9
6 changed files with 75 additions and 60 deletions
@@ -1319,6 +1319,10 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
         mparams.kv_overrides = params.kv_overrides.data();
     }
 
+    free((void *) mparams.node_layer_weights);
+
+    mparams.node_layer_weights = params.mpi_layer_split.data();
+
     return mparams;
 }
ggml-mpi.cpp (66 changed lines)
@@ -285,34 +285,27 @@ uint16_t** ggml_mpi_split_range(
     struct ggml_mpi_context * ctx_mpi,
     uint16_t start,
     uint16_t end,
-    float node_weights[]
+    const float node_weights[]
 ) {
     // Splits the range given by start and end
     // over the available nodes. This implementation
-    // assumes that node 0 handles the final part of the range
-    // while node 1 handles the beginning, to form a ring pipeline
 
-    // Only node 0 deals with the device splits, other nodes
-    // get the splits from the scatter layers operation
 
-    if (ctx_mpi->rank != 0) {
-        return NULL;
-    }
 
     uint16_t range_length = end - start + 1;
     uint16_t ** ranges = (uint16_t**) malloc(sizeof(uint16_t*) * ctx_mpi->size);
     for (int i = 0; i < ctx_mpi->size; i++) {
         ranges[i] = (uint16_t*) malloc(sizeof(uint16_t) * 2);
     }
     uint16_t next_layer = 0;
-    for (int i=1; i < ctx_mpi->size; i++) {
+    for (int i=0; i < ctx_mpi->size; i++) {
         ranges[i][0] = next_layer;
         ranges[i][1] = MIN(end, ranges[i][0] + (node_weights[i] * range_length) + start);
         next_layer = ranges[i][1];
     }
 
-    ranges[0][0] = next_layer;
-    ranges[0][1] = MIN(end, next_layer + (node_weights[0] * range_length) + start);
+    // ranges[0][0] = next_layer;
+    // ranges[0][1] = MIN(end, next_layer + (node_weights[0] * range_length) + start);
     return ranges;
 
 }
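For context, the following standalone sketch (not part of this commit) mirrors the reworked loop in ggml_mpi_split_range to show how per-node weights become layer ranges once the rank-0 guard and the special-cased node 0 are gone. The world size of 3, the example weights, and the MIN definition are assumptions made only for illustration.

    // split_sketch.cpp: illustrative only; mirrors the loop shown in the hunk above
    #include <cstdio>
    #include <cstdint>

    #define MIN(a, b) ((a) < (b) ? (a) : (b)) // assumed to match the macro used by ggml-mpi.cpp

    int main() {
        const int      size            = 3;                     // hypothetical MPI world size
        const float    node_weights[3] = {0.50f, 0.25f, 0.25f}; // hypothetical per-node weights
        const uint16_t start = 0, end = 31;                     // e.g. a 32-layer model

        uint16_t ranges[3][2];
        const uint16_t range_length = end - start + 1;
        uint16_t next_layer = 0;
        for (int i = 0; i < size; i++) {
            ranges[i][0] = next_layer;
            // float product is truncated on assignment, exactly as in the diff
            ranges[i][1] = MIN(end, ranges[i][0] + (node_weights[i] * range_length) + start);
            next_layer   = ranges[i][1];
        }
        for (int i = 0; i < size; i++) {
            printf("rank %d: layers [%d, %d)\n", i, ranges[i][0], ranges[i][1]);
        }
        return 0;
    }

With these inputs the sketch prints [0, 16), [16, 24) and [24, 31). Because the last range is clamped to end and the consumer loop in llm_load_tensors (further down) iterates j < ranges[i][1], the final layer is not reassigned for this example input; whether that is intended is not clear from the diff alone.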
@@ -775,8 +768,13 @@ GGML_CALL static void ggml_backend_mpi_buffer_free_buffer(ggml_backend_buffer_t
 
 GGML_CALL static void ggml_backend_mpi_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     auto * ctx = (ggml_backend_mpi_buffer_context *) buffer->context;
+
+    if (ggml_backend_mpi_buffer_rank(buffer) != ggml_backend_mpi_buffer_local_rank(buffer)) {
+        return;
+    }
+
     // fprintf(stderr, "SETTING TENSOR WITHOUT MPI CALLS FOR %s (%s) AND TGT BUFFER %s\n", tensor->name, ggml_backend_buffer_name(tensor->buffer), ggml_backend_buffer_name(buffer));
-    return ctx->wrapped_buffer->iface.set_tensor(ctx->wrapped_buffer, tensor, data, offset, size);
+    ctx->wrapped_buffer->iface.set_tensor(ctx->wrapped_buffer, tensor, data, offset, size);
 }
 
 GGML_CALL static void ggml_backend_mpi_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
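The guard added above is what lets every rank run the same weight-loading code: writes into buffers owned by another rank simply become no-ops. A minimal sketch of that ownership check, using stand-in types rather than the real ggml structs:

    // guard_sketch.cpp: illustrative only; fake_buffer stands in for the MPI-wrapped ggml buffer
    #include <cstdio>
    #include <cstring>
    #include <vector>

    struct fake_buffer {
        int owner_rank;              // what ggml_backend_mpi_buffer_rank() would report
        int local_rank;              // what ggml_backend_mpi_buffer_local_rank() would report
        std::vector<char> storage;   // stands in for the wrapped (real) buffer
    };

    // mirrors the early return added to ggml_backend_mpi_buffer_set_tensor
    static void set_tensor(fake_buffer & buf, const void * data, size_t offset, size_t size) {
        if (buf.owner_rank != buf.local_rank) {
            return; // tensor lives on another rank; nothing to store locally
        }
        std::memcpy(buf.storage.data() + offset, data, size);
    }

    int main() {
        const float weights[4] = {1.0f, 2.0f, 3.0f, 4.0f};

        fake_buffer owned  = {0, 0, std::vector<char>(sizeof(weights))};
        fake_buffer remote = {2, 0, std::vector<char>(sizeof(weights))};

        set_tensor(owned,  weights, 0, sizeof(weights)); // forwarded to the wrapped buffer
        set_tensor(remote, weights, 0, sizeof(weights)); // skipped: rank 0 does not own it

        float a = 0.0f, b = 0.0f;
        std::memcpy(&a, owned.storage.data(),  sizeof(float));
        std::memcpy(&b, remote.storage.data(), sizeof(float));
        printf("owned[0]=%.1f remote[0]=%.1f\n", a, b);
        return 0;
    }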
@@ -794,8 +792,12 @@ GGML_CALL static void ggml_backend_mpi_buffer_get_tensor(ggml_backend_buffer_t b
 }
 
 GGML_CALL static bool ggml_backend_mpi_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
-    return ggml_backend_mpi_buffer_unwrap(buffer)->iface.cpy_tensor(ggml_backend_mpi_buffer_unwrap(buffer), src, dst);
+    if (ggml_backend_mpi_buffer_rank(src->buffer) == ggml_backend_mpi_buffer_rank(dst->buffer)) {
+        return ggml_backend_mpi_buffer_unwrap(buffer)->iface.cpy_tensor(ggml_backend_mpi_buffer_unwrap(buffer), src, dst);
+    }
+
+    return true;
 }
 
 GGML_CALL static void ggml_backend_mpi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -849,25 +851,25 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer
 }
 
 bool ggml_backend_mpi_cpy_tensor_async(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    int src_rank = ggml_backend_mpi_buffer_rank(src->buffer);
+    int dst_rank = ggml_backend_mpi_buffer_rank(dst->buffer);
+
+    auto * ctx = static_cast<ggml_mpi_context *>(backend->context);
+
+    if (ctx->remote) {
+        return true;
+    }
+
+    if (src_rank == dst_rank) {
+        // src->buffer->iface.cpy_tensor(src->buffer, src, dst);
+        return true;
+    }
+
+    if (src_rank == ctx->rank) {
+        ggml_mpi_tensor_send(src, dst_rank, ctx->comm);
+    } else if (dst_rank == ctx->rank){
+        ggml_mpi_tensor_recv(dst, src_rank, ctx->comm);
+    }
 
-//    int src_rank = ggml_backend_mpi_buffer_rank(src->buffer);
-//    int dst_rank = ggml_backend_mpi_buffer_rank(dst->buffer);
-//
-//    auto * ctx = static_cast<ggml_mpi_context *>(backend->context);
-//
-//    if (ctx->remote) {
-//        return true;
-//    }
-//
-//    if (src_rank == dst_rank) {
-////        src->buffer->iface.cpy_tensor(src->buffer, src, dst);
-//        return true;
-//    }
-//
-//    if (src_rank == ggml_backend_mpi_local_rank(backend)) {
-//        ggml_mpi_tensor_send(src, dst_rank, ctx->comm);
-//    } else if (dst_rank == ggml_backend_mpi_local_rank(backend)){
-//        ggml_mpi_tensor_recv(dst, src_rank, ctx->comm);
-//    }
     return true;
 
 }
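Once layers live on different ranks, the communication pattern in ggml_backend_mpi_cpy_tensor_async is: the rank that owns the source tensor sends, the rank that owns the destination receives, and every other rank does nothing. A plain-MPI sketch of that pattern (an assumed analogue of ggml_mpi_tensor_send / ggml_mpi_tensor_recv, not code from this repository):

    // cpy_sketch.cpp: build with mpic++, run with e.g. mpirun -n 2 ./a.out
    #include <mpi.h>
    #include <cstdio>
    #include <vector>

    int main(int argc, char ** argv) {
        MPI_Init(&argc, &argv);
        int rank = 0;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        // pretend this is the data of a tensor owned by rank 0 and needed on rank 1
        std::vector<float> data(8, rank == 0 ? 42.0f : 0.0f);
        const int src_rank = 0, dst_rank = 1;

        if (rank == src_rank) {
            MPI_Send(data.data(), (int) data.size(), MPI_FLOAT, dst_rank, 0, MPI_COMM_WORLD);
        } else if (rank == dst_rank) {
            MPI_Recv(data.data(), (int) data.size(), MPI_FLOAT, src_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            printf("rank %d received %.1f\n", rank, data[0]);
        } // all other ranks fall through, like the same-rank/remote cases above

        MPI_Finalize();
        return 0;
    }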
ggml-mpi.h
@@ -202,7 +202,7 @@ uint16_t** ggml_mpi_split_range(
     struct ggml_mpi_context * ctx_mpi,
     uint16_t start,
     uint16_t end,
-    float node_weights[]
+    const float node_weights[]
 );
 
 // BACKEND V2
ggml.h (2 changed lines)
@@ -226,7 +226,7 @@
 
 #define GGML_MAX_DIMS 4
 #define GGML_MAX_PARAMS 2048
-#define GGML_MAX_CONTEXTS 128
+#define GGML_MAX_CONTEXTS 256
 #define GGML_MAX_SRC 10
 #ifndef GGML_MAX_NAME
 #define GGML_MAX_NAME 64
llama.cpp (59 changed lines)
@@ -2001,6 +2001,10 @@ struct llama_model {
     int64_t t_load_us = 0;
     int64_t t_start_us = 0;
 
+#ifdef GGML_USE_MPI
+    ggml_mpi_context * ctx_mpi = nullptr;
+#endif
+
     ~llama_model() {
         for (struct ggml_context * ctx : ctxs) {
             ggml_free(ctx);
@@ -2099,9 +2103,7 @@ struct llama_context {
     struct ggml_tensor * inp_s_mask; // F32 [1, kv_size]
     struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch]
 
-#ifdef GGML_USE_MPI
-    ggml_mpi_context * ctx_mpi = NULL;
-#endif
-
 };
 
 //
@@ -3277,6 +3279,11 @@ static void llm_load_hparams(
     auto & hparams = model.hparams;
     const gguf_context * ctx = ml.ctx_gguf;
 
+#ifdef GGML_USE_MPI
+    model.ctx_mpi = ggml_mpi_init();
+
+#endif
+
     // get metadata as string
     for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
         enum gguf_type type = gguf_get_kv_type(ctx, i);
@@ -4008,6 +4015,7 @@ static bool llm_load_tensors(
     enum llama_split_mode split_mode,
     int main_gpu,
     const float * tensor_split,
+    const float * node_split,
     bool use_mlock,
     llama_progress_callback progress_callback,
     void * progress_callback_user_data) {
@@ -4097,11 +4105,17 @@ static bool llm_load_tensors(
     }
 
 #ifdef GGML_USE_MPI
-    // TESTING: Setting all non-input/output layers to node 1
-    for (int64_t i = 0; i < n_layer; i++) {
-        ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[i].buft, 1);
-        ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[i].buft_matrix, 1);
+    uint16_t** ranges = ggml_mpi_split_range(model.ctx_mpi, 0, n_layer - 1, node_split);
+
+    size_t size = ggml_mpi_size(model.ctx_mpi);
+
+    for (size_t i = 0; i < size; i++) {
+        for (uint16_t j = ranges[i][0]; j < ranges[i][1]; j++) {
+            printf("Setting buffer rank for i %zu and j %d\n", i, j);
+            ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[j].buft, (int)i);
+            ggml_backend_mpi_buffer_type_set_rank(model.buft_layer[j].buft_matrix, (int)i);
+        }
     }
 
 
@@ -5101,7 +5115,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
 #endif
 
         if (!llm_load_tensors(
-            ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
+            ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.node_layer_weights, params.use_mlock,
             params.progress_callback, params.progress_callback_user_data
         )) {
             return -2;
@@ -8813,7 +8827,7 @@ static int llama_decode_internal(
     uint32_t n_tokens_all = batch_all.n_tokens;
 
 #ifdef GGML_USE_MPI
-    ggml_mpi_eval_init(lctx.ctx_mpi, &(batch_all.n_tokens), &(batch_all.pos), &(batch_all.n_seq_id), &(batch_all.seq_id), &(batch_all.logits), lctx.cparams.n_seq_max);
+    ggml_mpi_eval_init(lctx.model.ctx_mpi, &(batch_all.n_tokens), &(batch_all.pos), &(batch_all.n_seq_id), &(batch_all.seq_id), &(batch_all.logits), lctx.cparams.n_seq_max);
     n_tokens_all = batch_all.n_tokens;
 #endif
 
@@ -9003,7 +9017,7 @@ static int llama_decode_internal(
     // update the graphs to skip "result_output" if logits are not needed
     if (res) {
 #ifdef GGML_USE_MPI
-        if (ggml_mpi_rank(lctx.ctx_mpi) == 0) {
+        if (ggml_mpi_rank(lctx.model.ctx_mpi) == 0) {
 #endif
 
         ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
@@ -12636,7 +12650,7 @@ static int llama_apply_lora_from_file_internal(
 //
 
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
-        static_cast<int32_t *>(calloc(1, sizeof(int32_t))),
+        static_cast<float *>(calloc(1, sizeof(float))),
         /*.n_gpu_layers =*/ 0,
         /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu =*/ 0,
@@ -12706,7 +12720,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 
 int llama_node_id(struct llama_context * ctx) {
 #ifdef GGML_USE_MPI
-    return ggml_mpi_rank(ctx->ctx_mpi);
+    return ggml_mpi_rank(ctx->model.ctx_mpi);
 
 #endif
     return 0;
@@ -13026,8 +13040,13 @@ struct llama_context * llama_new_context_with_model(
 
 #ifdef GGML_USE_MPI
 
+    std::vector<ggml_backend_t> new_backends;
 
-    ctx->backends = {ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size(), 1), ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size(), 0)};
+    for (size_t i = 0; i < ggml_mpi_size(model->ctx_mpi); i++) {
+        new_backends.push_back(ggml_backend_mpi_init(ctx->backends.data(), ctx->backends.size(), (int) i));
+    }
+
+    ctx->backends = new_backends;
 
 
@@ -13144,23 +13163,13 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
-#ifdef GGML_USE_MPI
-    ctx->ctx_mpi = ggml_mpi_init();
-
-#endif
-
     return ctx;
 }
 
 void llama_split_layers_weighted(struct llama_context * ctx, float device_weights[], size_t num_weights) {
-//#ifdef GGML_USE_MPI
-//    if (ggml_mpi_rank(ctx->ctx_mpi) == 0 && ggml_mpi_size(ctx->ctx_mpi) != num_weights) {
-//        GGML_ASSERT(false && "Must have same number of split percentages as devices");
-//    }
-//    uint16_t** ranges = ggml_mpi_split_range(ctx->ctx_mpi, 0, ctx->model.hparams.n_layer - 1, device_weights);
-//    ggml_mpi_scatter_layers(ctx->ctx_mpi, ranges);
-//    free(ranges);
-//#endif
 
 }
 
 void llama_free(struct llama_context * ctx) {
@@ -13998,7 +14007,7 @@ int32_t llama_decode(
         struct llama_batch batch) {
 
 #ifdef GGML_USE_MPI
-    if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
+    if (ggml_mpi_rank(ctx->model.ctx_mpi) > 0) {
         // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
         const int n_ctx = llama_n_ctx(ctx);
         std::vector<llama_token> tmp(n_ctx, llama_token_bos(&ctx->model));
llama.h (2 changed lines)
@@ -203,7 +203,7 @@ extern "C" {
 
     struct llama_model_params {
         // Array of layers to allocate to each node
-        int32_t* n_node_layers;
+        const float * node_layer_weights;
 
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
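On the caller side, the int32_t * n_node_layers field is replaced by const float * node_layer_weights, and the common-code hunk at the top wires params.mpi_layer_split into it after freeing the one-element array that llama_model_default_params() calloc()s. A hedged usage sketch of the new field, assuming this fork's llama.h; the weight values and model path are made up, and MPI launch details are omitted:

    // usage_sketch.cpp: illustrative only, compiles against this fork's llama.h
    #include <cstdlib>
    #include <vector>
    #include "llama.h"

    int main() {
        std::vector<float> layer_split = {0.50f, 0.25f, 0.25f}; // one weight per MPI rank

        llama_model_params mparams = llama_model_default_params();

        // the defaults calloc() a placeholder array, so release it before
        // pointing at caller-owned weights, as the first hunk of this commit does
        free((void *) mparams.node_layer_weights);
        mparams.node_layer_weights = layer_split.data();

        llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (model == nullptr) {
            return 1;
        }
        llama_free_model(model);
        return 0;
    }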