Wrap backends with MPI backend

Branden Butler 2024-02-19 12:21:48 -06:00
parent b98274c76f
commit 968cefb4a9
4 changed files with 74 additions and 112 deletions
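
The change replaces the index-based MPI backend initialization with a wrapper: ggml_backend_mpi_init() now takes an already-initialized backend, stores it in the MPI context, and brackets that backend's graph computation with the MPI pre/post communication steps (ggml_mpi_graph_compute_pre/post). A minimal sketch of this wrapping pattern, using simplified stand-in types rather than the real ggml_backend interfaces (the types, member names, and empty pre/post bodies below are illustrative assumptions, not the commit's code):

    #include <memory>
    #include <utility>

    // Simplified stand-ins for the ggml types; the real structs live in ggml-backend.h.
    struct graph {};        // placeholder for ggml_cgraph

    struct backend {        // placeholder for ggml_backend
        virtual ~backend() = default;
        virtual const char * name() const = 0;
        virtual bool compute(graph & g) = 0;
    };

    // The MPI backend owns a wrapped backend and brackets its compute call
    // with the pre/post communication steps.
    struct mpi_backend : backend {
        explicit mpi_backend(std::unique_ptr<backend> wrapped)
            : wrapped_backend(std::move(wrapped)) {}

        const char * name() const override {
            // The commit forwards get_name to the wrapped backend.
            return wrapped_backend->name();
        }

        bool compute(graph & g) override {
            graph_compute_pre(g);                   // receive inputs from the previous rank
            bool ok = wrapped_backend->compute(g);  // delegate the actual work
            graph_compute_post(g);                  // send outputs to the next rank
            return ok;
        }

    private:
        void graph_compute_pre(graph &) {}   // MPI communication elided in this sketch
        void graph_compute_post(graph &) {}

        std::unique_ptr<backend> wrapped_backend;
    };

The same shape appears below in ggml_backend_mpi_graph_compute, which runs the pre step, forwards the graph to the wrapped backend's graph_compute, then runs the post step.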

View file

@@ -22,6 +22,7 @@ struct ggml_mpi_context {
int layer_end;
struct ggml_tensor *inp0;
std::string name;
struct ggml_backend * wrapped_backend;
};
void ggml_mpi_backend_init(void) {
@@ -247,8 +248,6 @@ void ggml_mpi_scatter_layers(
}
void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * gf, int n_layers) {
const int mpi_rank = ctx_mpi->rank;
const int mpi_size = ctx_mpi->size;
struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
if (inp_tokens == NULL) {
@@ -286,73 +285,22 @@ void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml
}
{
//const int n_per_node = (n_layers + (mpi_size - 1)) / mpi_size;
const int mpi_idx = mpi_rank > 0 ? mpi_rank - 1 : mpi_size - 1;
//const int il0 = (mpi_idx + 0) * n_per_node;
//const int il1 = MIN(n_layers, (mpi_idx + 1) * n_per_node);
int il0 = ctx_mpi->layer_start;
int il1 = MIN(n_layers, ctx_mpi->layer_end);
char name_l0[GGML_MAX_NAME];
char name_l1[GGML_MAX_NAME];
snprintf(name_l0, sizeof(name_l0), "layer_inp_%d", il0);
snprintf(name_l1, sizeof(name_l1), "layer_inp_%d", il1);
const int idx_l0 = ggml_graph_get_node_idx(gf, name_l0);
const int idx_l1 = mpi_rank > 0 ? ggml_graph_get_node_idx(gf, name_l1) + 1 : gf->n_nodes;
if (idx_l0 < 0 || idx_l1 < 0) {
fprintf(stderr, "%s: layer input nodes not found\n", __func__);
return;
}
// attach the input data to all nodes that need it
// TODO: not great - should be able to do this without modifying the compute graph (see next TODO below)
for (int i = idx_l0; i < idx_l1; i++) {
if (gf->nodes[i]->src[0] == gf->nodes[idx_l0]) {
gf->nodes[i]->src[0] = inp0;
}
if (gf->nodes[i]->src[1] == gf->nodes[idx_l0]) {
gf->nodes[i]->src[1] = inp0;
}
}
// TODO: instead of rearranging the nodes, we should be able to execute a subset of the compute graph
for (int i = 1; i < idx_l1 - idx_l0; i++) {
gf->nodes[i] = gf->nodes[idx_l0 + i];
}
// the first node performs the "get_rows" operation, the rest of the nodes get the data from the previous node
if (mpi_idx != 0) {
gf->nodes[0]->op = GGML_OP_NONE;
}
gf->n_nodes = idx_l1 - idx_l0;
}
}
// TODO: there are many improvements that can be done to this implementation
void ggml_mpi_graph_compute_pre(
struct ggml_mpi_context * ctx_mpi,
struct ggml_cgraph * gf,
int n_layers) {
struct ggml_cgraph * gf) {
const int mpi_rank = ctx_mpi->rank;
const int mpi_size = ctx_mpi->size;
struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
struct ggml_tensor * inp_tokens = gf->nodes[0];
if (inp_tokens == NULL) {
fprintf(stderr, "%s: tensor 'inp_tokens' not found\n", __func__);
return;
}
struct ggml_tensor * inp0 = ctx_mpi->inp0;
struct ggml_tensor * inp0 = ggml_graph_get_tensor(gf, "layer_inp_0");
if (inp0 == NULL) {
fprintf(stderr, "%s: tensor 'inp0' not found\n", __func__);
return;
@@ -381,9 +329,7 @@ void ggml_mpi_graph_compute_pre(
void ggml_mpi_graph_compute_post(
struct ggml_mpi_context * ctx_mpi,
struct ggml_cgraph * gf,
int n_layers) {
UNUSED(n_layers);
struct ggml_cgraph * gf) {
const int mpi_rank = ctx_mpi->rank;
const int mpi_size = ctx_mpi->size;
@@ -396,9 +342,24 @@ void ggml_mpi_graph_compute_post(
// BACKEND V2
GGML_CALL static bool ggml_backend_mpi_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
struct ggml_mpi_context * ctx = (ggml_mpi_context *) backend->context;
ggml_mpi_graph_compute_pre(ctx, cgraph);
ggml_backend_t wrapped_backend = ctx->wrapped_backend;
bool ret = ggml_backend_graph_compute(wrapped_backend, cgraph);
ggml_mpi_graph_compute_post(ctx, cgraph);
return ret;
}
static const char * ggml_backend_mpi_name(ggml_backend_t backend) {
auto * ctx = static_cast<ggml_mpi_context *>(backend->context);
return ctx->name.c_str();
return ctx->wrapped_backend->iface.get_name(backend);
}
static void ggml_backend_mpi_free(ggml_backend_t backend) {
@@ -427,20 +388,6 @@ GGML_CALL static bool ggml_backend_mpi_supports_op(ggml_backend_t backend, const
GGML_UNUSED(backend);
}
static struct ggml_backend_i mpi_backend_i = {
/* .get_name = */ ggml_backend_mpi_name,
/* .free = */ ggml_backend_mpi_free,
/* .get_default_buffer_type = */ ggml_backend_mpi_get_default_buffer_type,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_graph_compute,
/* .supports_op = */ ggml_backend_mpi_supports_op,
};
std::vector<ggml_mpi_device> ggml_mpi_available_devices_internal() {
@@ -473,23 +420,42 @@ ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_type
return ggml_backend_wrapped_buffer_type;
}
ggml_backend_t ggml_backend_mpi_init(int index) {
ggml_backend_t ggml_backend_mpi_init(ggml_backend_t wrapped_backend) {
struct ggml_backend_i mpi_backend_i = {
/* .get_name = */ wrapped_backend->iface.get_name,
/* .free = */ ggml_backend_mpi_free,
/* .get_default_buffer_type = */ ggml_backend_mpi_get_default_buffer_type,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_mpi_graph_compute,
/* .supports_op = */ ggml_backend_mpi_supports_op,
};
ggml_mpi_context * ctx = ggml_mpi_init();
ctx->wrapped_backend = wrapped_backend;
auto *mpi_backend = new ggml_backend {
/* .interface = */ mpi_backend_i,
/* .context = */ ggml_mpi_init(),
/* .context = */ ctx,
};
return mpi_backend;
}
static ggml_backend_t ggml_backend_reg_mpi_init(const char * params, void * user_data) {
// TODO check what the parameters are for. Could use it to setup the MPI comms and routes?
GGML_UNUSED(params);
return ggml_backend_mpi_init(intptr_t(user_data));
return ggml_backend_mpi_init(ggml_backend_cpu_init());
}
ggml_backend_buffer_type_t ggml_backend_mpi_buffer_type(int index) {
ggml_backend_buffer_type_t ggml_backend_mpi_buffer_type() {
return ggml_backend_cpu_buffer_type();
}
@@ -501,7 +467,7 @@ int ggml_backend_mpi_reg_devices() {
ggml_backend_register(
device.name,
ggml_backend_reg_mpi_init,
ggml_backend_mpi_buffer_type(device.index),
ggml_backend_mpi_buffer_type(),
reinterpret_cast<void *>(intptr_t(device.index))
);
}
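
For reference, the new entry point is meant to be handed an already-initialized backend. A minimal usage sketch based only on functions appearing in this diff (graph construction and error handling omitted; whether ggml_backend_mpi_free also frees the wrapped backend is not shown in these hunks):

    ggml_backend_t cpu_backend = ggml_backend_cpu_init();
    ggml_backend_t mpi_backend = ggml_backend_mpi_init(cpu_backend); // MPI backend wraps the CPU backend
    // ... build a ggml_cgraph * gf for the model ...
    ggml_backend_graph_compute(mpi_backend, gf); // runs the MPI pre step, the wrapped compute, then the post step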

View file

@@ -53,7 +53,6 @@ struct ggml_mpi_context * ggml_mpi_init(void);
void ggml_mpi_graph_creation_post(struct ggml_mpi_context * ctx_mpi, struct ggml_cgraph * cgraph, int n_layers);
GGML_API ggml_backend_t ggml_backend_mpi_init(int index);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_mpi_wrap_buffer(ggml_backend_buffer_type_t buft);
/**
@@ -185,8 +184,7 @@ void ggml_mpi_scatter_layers(
*/
void ggml_mpi_graph_compute_pre(
struct ggml_mpi_context * ctx_mpi,
struct ggml_cgraph * gf,
int n_layers);
struct ggml_cgraph * gf);
/**
* Sends the output tensor to the next node for processing
@@ -198,8 +196,7 @@ void ggml_mpi_graph_compute_pre(
*/
void ggml_mpi_graph_compute_post(
struct ggml_mpi_context * ctx_mpi,
struct ggml_cgraph * gf,
int n_layers);
struct ggml_cgraph * gf);
// BACKEND V2
@@ -213,6 +210,8 @@ struct ggml_mpi_device {
#define MPI_BACKEND_NAME "MPI"
GGML_CALL int ggml_backend_mpi_reg_devices();
GGML_CALL ggml_backend_t ggml_backend_mpi_init(ggml_backend_t wrapped_backend);
#ifdef __cplusplus
}
#endif

View file

@@ -4090,15 +4090,15 @@ static bool llm_load_tensors(
}
#ifdef GGML_USE_MPI
for (int64_t i = 0; i < n_layer; i++) {
model.buft_layer[i] = {ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft_matrix),
ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft)};
}
model.buft_input = {ggml_backend_mpi_wrap_buffer(model.buft_input.buft_matrix),
ggml_backend_mpi_wrap_buffer(model.buft_input.buft)};
model.buft_output = {ggml_backend_mpi_wrap_buffer(model.buft_output.buft_matrix),
ggml_backend_mpi_wrap_buffer(model.buft_output.buft)};
// for (int64_t i = 0; i < n_layer; i++) {
// model.buft_layer[i] = {ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft_matrix),
// ggml_backend_mpi_wrap_buffer(model.buft_layer[i].buft)};
// }
//
// model.buft_input = {ggml_backend_mpi_wrap_buffer(model.buft_input.buft_matrix),
// ggml_backend_mpi_wrap_buffer(model.buft_input.buft)};
// model.buft_output = {ggml_backend_mpi_wrap_buffer(model.buft_output.buft_matrix),
// ggml_backend_mpi_wrap_buffer(model.buft_output.buft)};
#endif
// count used buffer types
@@ -8764,10 +8764,7 @@ static void llama_graph_compute(
llama_context & lctx,
ggml_cgraph * gf,
int n_threads) {
#ifdef GGML_USE_MPI
const int64_t n_layer = lctx.model.hparams.n_layer;
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
#endif
#ifdef GGML_USE_METAL
if (ggml_backend_is_metal(lctx.backend_metal)) {
@@ -8784,9 +8781,6 @@ static void llama_graph_compute(
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
#ifdef GGML_USE_MPI
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
#endif
}
// decode a batch of tokens by evaluating the transformer
@@ -12619,6 +12613,7 @@ static int llama_apply_lora_from_file_internal(
//
struct llama_model_params llama_model_default_params() {
struct llama_model_params result = {
/*.n_node_layers =*/ static_cast<int32_t *>(calloc(1, sizeof(int32_t))),
/*.n_gpu_layers =*/ 0,
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
/*.main_gpu =*/ 0,
@@ -12998,18 +12993,7 @@ struct llama_context * llama_new_context_with_model(
}
#endif
#ifdef GGML_USE_MPI
// with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
ggml_backend_t backend = ggml_backend_mpi_init(model->main_gpu);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
#endif
ctx->backend_cpu = ggml_backend_cpu_init();
if (ctx->backend_cpu == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
@@ -13018,6 +13002,16 @@ struct llama_context * llama_new_context_with_model(
}
ctx->backends.push_back(ctx->backend_cpu);
#ifdef GGML_USE_MPI
for(auto & backend : ctx->backends) {
backend = ggml_backend_mpi_init(backend);
}
ctx->backend_cpu = ctx->backends.back();
#endif
if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, kv_size, cparams.offload_kqv)) {
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
llama_free(ctx);

View file

@@ -202,6 +202,9 @@ extern "C" {
};
struct llama_model_params {
// Array of layers to allocate to each node
int32_t* n_node_layers;
int32_t n_gpu_layers; // number of layers to store in VRAM
enum llama_split_mode split_mode; // how to split the model across multiple GPUs