mpi : add support for distributed inference via MPI (#2099)

* MPI support, first cut

* fix warnings, update README

* fixes

* wrap includes

* PR comments

* Update CMakeLists.txt

* Add GH workflow, fix test

* Add info to README

* mpi : trying to move more MPI stuff into ggml-mpi (WIP) (#2099)

* mpi : add names for layer inputs + prep ggml_mpi_graph_compute()

* mpi : move all MPI logic into ggml-mpi

Not tested yet

* mpi : various fixes - communication now works but results are wrong

* mpi : fix output tensor after MPI compute (still not working)

* mpi : fix inference

* mpi : minor

* Add OpenMPI to GH action

* [mpi] continue-on-error: true

* mpi : fix after master merge

* [mpi] Link MPI C++ libraries to fix OpenMPI

* tests : fix new llama_backend API

* [mpi] use MPI_INT32_T

* mpi : factor out recv / send in functions and reuse

* mpi : extend API to allow usage with outer backends (e.g. Metal)

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Evan Miller 2023-07-10 11:49:56 -04:00 committed by GitHub
parent 1d16309969
commit 5656d10599
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
18 changed files with 460 additions and 35 deletions

View file

@ -19,6 +19,9 @@
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#ifdef GGML_USE_MPI
#include "ggml-mpi.h"
#endif
#ifdef GGML_USE_K_QUANTS
#ifndef QK_K
#ifdef GGML_QKK_64
@ -352,6 +355,10 @@ struct llama_context {
ggml_metal_context * ctx_metal = NULL;
#endif
#ifdef GGML_USE_MPI
ggml_mpi_context * ctx_mpi = NULL;
#endif
int buf_last = 0;
size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@ -870,7 +877,7 @@ bool llama_mlock_supported() {
return llama_mlock::SUPPORTED;
}
void llama_init_backend(bool numa) {
void llama_backend_init(bool numa) {
ggml_time_init();
// needed to initialize f16 tables
@ -883,6 +890,16 @@ void llama_init_backend(bool numa) {
if (numa) {
ggml_numa_init();
}
#ifdef GGML_USE_MPI
ggml_mpi_backend_init();
#endif
}
void llama_backend_free() {
#ifdef GGML_USE_MPI
ggml_mpi_backend_free();
#endif
}
int64_t llama_time_us() {
@ -1284,13 +1301,17 @@ static bool llama_eval_internal(
llama_context & lctx,
const llama_token * tokens,
const float * embd,
const int n_tokens,
const int n_past,
int n_tokens,
int n_past,
int n_threads,
const char * cgraph_fname) {
LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
#ifdef GGML_USE_MPI
ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
#endif
const int64_t t_start_us = ggml_time_us();
const int N = n_tokens;
@ -1331,11 +1352,16 @@ static bool llama_eval_internal(
struct ggml_tensor * inpL;
if (tokens) {
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
ggml_set_name(embd, "embd");
memcpy(embd->data, tokens, N*ggml_element_size(embd));
inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
ggml_set_name(inp_tokens, "inp_tokens");
inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
} else {
#ifdef GGML_USE_MPI
GGML_ASSERT(false && "not implemented");
#endif
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
}
@ -1353,18 +1379,20 @@ static bool llama_eval_internal(
offload_func_t offload_func_v = llama_nop;
#ifdef GGML_USE_CUBLAS
if (n_gpu_layers > n_layer) {
offload_func_nr = ggml_cuda_assign_buffers;
}
if (n_gpu_layers > n_layer + 1) {
offload_func_v = ggml_cuda_assign_buffers;
}
if (n_gpu_layers > n_layer + 2) {
offload_func_kq = ggml_cuda_assign_buffers;
}
if (n_gpu_layers > n_layer) {
offload_func_nr = ggml_cuda_assign_buffers;
}
if (n_gpu_layers > n_layer + 1) {
offload_func_v = ggml_cuda_assign_buffers;
}
if (n_gpu_layers > n_layer + 2) {
offload_func_kq = ggml_cuda_assign_buffers;
}
#endif // GGML_USE_CUBLAS
for (int il = 0; il < n_layer; ++il) {
ggml_format_name(inpL, "layer_inp_%d", il);
offload_func_t offload_func = llama_nop;
#ifdef GGML_USE_CUBLAS
@ -1571,7 +1599,6 @@ static bool llama_eval_internal(
// input for next layer
inpL = cur;
}
lctx.use_buf(ctx0, 0);
@ -1579,7 +1606,6 @@ static bool llama_eval_internal(
// used at the end to optionally extract the embeddings
struct ggml_tensor * embeddings = NULL;
// norm
{
cur = ggml_rms_norm(ctx0, inpL);
@ -1594,7 +1620,6 @@ static bool llama_eval_internal(
embeddings = cur;
}
// lm_head
cur = ggml_mul_mat(ctx0, model.output, cur);
ggml_set_name(cur, "result_output");
@ -1607,6 +1632,10 @@ static bool llama_eval_internal(
// run the computation
ggml_build_forward_expand(&gf, cur);
#if GGML_USE_MPI
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
#endif
#ifdef GGML_USE_METAL
if (lctx.ctx_metal && N == 1) {
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
@ -1635,6 +1664,15 @@ static bool llama_eval_internal(
ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
#endif
#if GGML_USE_MPI
ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
#endif
// update kv token count
lctx.kv_self.n = n_past + N;
struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
if (cgraph_fname) {
ggml_graph_export(&gf, cgraph_fname);
}
@ -1650,23 +1688,17 @@ static bool llama_eval_internal(
// ggml_graph_dump_dot(&gf, NULL, "llama.dot");
//}
//embd_w.resize(n_vocab*N);
//memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
// update kv token count
lctx.kv_self.n = n_past + N;
// extract logits
{
auto & logits_out = lctx.logits;
if (lctx.logits_all) {
logits_out.resize(n_vocab * N);
memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
} else {
// return result for just the last token
logits_out.resize(n_vocab);
memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
}
}
@ -2697,6 +2729,18 @@ struct llama_context * llama_new_context_with_model(
}
#endif
#ifdef GGML_USE_MPI
ctx->ctx_mpi = ggml_mpi_init();
if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
// Enter a blocking eval loop with dummy input, letting rank=0 drive the process
const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos());
while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
llama_backend_free();
exit(1);
}
#endif
return ctx;
}