mpi : add support for distributed inference via MPI (#2099)
* MPI support, first cut * fix warnings, update README * fixes * wrap includes * PR comments * Update CMakeLists.txt * Add GH workflow, fix test * Add info to README * mpi : trying to move more MPI stuff into ggml-mpi (WIP) (#2099) * mpi : add names for layer inputs + prep ggml_mpi_graph_compute() * mpi : move all MPI logic into ggml-mpi Not tested yet * mpi : various fixes - communication now works but results are wrong * mpi : fix output tensor after MPI compute (still not working) * mpi : fix inference * mpi : minor * Add OpenMPI to GH action * [mpi] continue-on-error: true * mpi : fix after master merge * [mpi] Link MPI C++ libraries to fix OpenMPI * tests : fix new llama_backend API * [mpi] use MPI_INT32_T * mpi : factor out recv / send in functions and reuse * mpi : extend API to allow usage with outer backends (e.g. Metal) --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
1d16309969
commit
5656d10599
18 changed files with 460 additions and 35 deletions
98
llama.cpp
98
llama.cpp
|
@ -19,6 +19,9 @@
|
|||
#ifdef GGML_USE_METAL
|
||||
#include "ggml-metal.h"
|
||||
#endif
|
||||
#ifdef GGML_USE_MPI
|
||||
#include "ggml-mpi.h"
|
||||
#endif
|
||||
#ifdef GGML_USE_K_QUANTS
|
||||
#ifndef QK_K
|
||||
#ifdef GGML_QKK_64
|
||||
|
@ -352,6 +355,10 @@ struct llama_context {
|
|||
ggml_metal_context * ctx_metal = NULL;
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_MPI
|
||||
ggml_mpi_context * ctx_mpi = NULL;
|
||||
#endif
|
||||
|
||||
int buf_last = 0;
|
||||
size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
|
||||
|
||||
|
@ -870,7 +877,7 @@ bool llama_mlock_supported() {
|
|||
return llama_mlock::SUPPORTED;
|
||||
}
|
||||
|
||||
void llama_init_backend(bool numa) {
|
||||
void llama_backend_init(bool numa) {
|
||||
ggml_time_init();
|
||||
|
||||
// needed to initialize f16 tables
|
||||
|
@ -883,6 +890,16 @@ void llama_init_backend(bool numa) {
|
|||
if (numa) {
|
||||
ggml_numa_init();
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_MPI
|
||||
ggml_mpi_backend_init();
|
||||
#endif
|
||||
}
|
||||
|
||||
void llama_backend_free() {
|
||||
#ifdef GGML_USE_MPI
|
||||
ggml_mpi_backend_free();
|
||||
#endif
|
||||
}
|
||||
|
||||
int64_t llama_time_us() {
|
||||
|
@ -1284,13 +1301,17 @@ static bool llama_eval_internal(
|
|||
llama_context & lctx,
|
||||
const llama_token * tokens,
|
||||
const float * embd,
|
||||
const int n_tokens,
|
||||
const int n_past,
|
||||
int n_tokens,
|
||||
int n_past,
|
||||
int n_threads,
|
||||
const char * cgraph_fname) {
|
||||
|
||||
LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
|
||||
|
||||
#ifdef GGML_USE_MPI
|
||||
ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
|
||||
#endif
|
||||
|
||||
const int64_t t_start_us = ggml_time_us();
|
||||
|
||||
const int N = n_tokens;
|
||||
|
@ -1331,11 +1352,16 @@ static bool llama_eval_internal(
|
|||
struct ggml_tensor * inpL;
|
||||
|
||||
if (tokens) {
|
||||
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||
ggml_set_name(embd, "embd");
|
||||
memcpy(embd->data, tokens, N*ggml_element_size(embd));
|
||||
inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
|
||||
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||
memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
|
||||
ggml_set_name(inp_tokens, "inp_tokens");
|
||||
|
||||
inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
|
||||
} else {
|
||||
#ifdef GGML_USE_MPI
|
||||
GGML_ASSERT(false && "not implemented");
|
||||
#endif
|
||||
|
||||
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
|
||||
memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
|
||||
}
|
||||
|
@ -1353,18 +1379,20 @@ static bool llama_eval_internal(
|
|||
offload_func_t offload_func_v = llama_nop;
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
if (n_gpu_layers > n_layer) {
|
||||
offload_func_nr = ggml_cuda_assign_buffers;
|
||||
}
|
||||
if (n_gpu_layers > n_layer + 1) {
|
||||
offload_func_v = ggml_cuda_assign_buffers;
|
||||
}
|
||||
if (n_gpu_layers > n_layer + 2) {
|
||||
offload_func_kq = ggml_cuda_assign_buffers;
|
||||
}
|
||||
if (n_gpu_layers > n_layer) {
|
||||
offload_func_nr = ggml_cuda_assign_buffers;
|
||||
}
|
||||
if (n_gpu_layers > n_layer + 1) {
|
||||
offload_func_v = ggml_cuda_assign_buffers;
|
||||
}
|
||||
if (n_gpu_layers > n_layer + 2) {
|
||||
offload_func_kq = ggml_cuda_assign_buffers;
|
||||
}
|
||||
#endif // GGML_USE_CUBLAS
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_format_name(inpL, "layer_inp_%d", il);
|
||||
|
||||
offload_func_t offload_func = llama_nop;
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
|
@ -1571,7 +1599,6 @@ static bool llama_eval_internal(
|
|||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
|
||||
}
|
||||
|
||||
lctx.use_buf(ctx0, 0);
|
||||
|
@ -1579,7 +1606,6 @@ static bool llama_eval_internal(
|
|||
// used at the end to optionally extract the embeddings
|
||||
struct ggml_tensor * embeddings = NULL;
|
||||
|
||||
|
||||
// norm
|
||||
{
|
||||
cur = ggml_rms_norm(ctx0, inpL);
|
||||
|
@ -1594,7 +1620,6 @@ static bool llama_eval_internal(
|
|||
embeddings = cur;
|
||||
}
|
||||
|
||||
|
||||
// lm_head
|
||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
ggml_set_name(cur, "result_output");
|
||||
|
@ -1607,6 +1632,10 @@ static bool llama_eval_internal(
|
|||
// run the computation
|
||||
ggml_build_forward_expand(&gf, cur);
|
||||
|
||||
#if GGML_USE_MPI
|
||||
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
if (lctx.ctx_metal && N == 1) {
|
||||
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
|
||||
|
@ -1635,6 +1664,15 @@ static bool llama_eval_internal(
|
|||
ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
|
||||
#endif
|
||||
|
||||
#if GGML_USE_MPI
|
||||
ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
|
||||
#endif
|
||||
|
||||
// update kv token count
|
||||
lctx.kv_self.n = n_past + N;
|
||||
|
||||
struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
|
||||
|
||||
if (cgraph_fname) {
|
||||
ggml_graph_export(&gf, cgraph_fname);
|
||||
}
|
||||
|
@ -1650,23 +1688,17 @@ static bool llama_eval_internal(
|
|||
// ggml_graph_dump_dot(&gf, NULL, "llama.dot");
|
||||
//}
|
||||
|
||||
//embd_w.resize(n_vocab*N);
|
||||
//memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
|
||||
|
||||
// update kv token count
|
||||
lctx.kv_self.n = n_past + N;
|
||||
|
||||
// extract logits
|
||||
{
|
||||
auto & logits_out = lctx.logits;
|
||||
|
||||
if (lctx.logits_all) {
|
||||
logits_out.resize(n_vocab * N);
|
||||
memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
|
||||
memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
|
||||
} else {
|
||||
// return result for just the last token
|
||||
logits_out.resize(n_vocab);
|
||||
memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
|
||||
memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2697,6 +2729,18 @@ struct llama_context * llama_new_context_with_model(
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_MPI
|
||||
ctx->ctx_mpi = ggml_mpi_init();
|
||||
|
||||
if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
|
||||
// Enter a blocking eval loop with dummy input, letting rank=0 drive the process
|
||||
const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos());
|
||||
while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
|
||||
llama_backend_free();
|
||||
exit(1);
|
||||
}
|
||||
#endif
|
||||
|
||||
return ctx;
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue