pipeline parallelism demo

slaren 2024-01-13 04:13:31 +01:00
parent f172de03f1
commit dbbaf82758
4 changed files with 261 additions and 189 deletions


@@ -1149,7 +1149,8 @@ int main(int argc, char ** argv) {
         // warmup run
         if (t.n_prompt > 0) {
-            test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads);
+            //test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads);
+            test_prompt(ctx, std::min(t.n_prompt, 32), 0, t.n_batch, t.n_threads);
         }
         if (t.n_gen > 0) {
             test_gen(ctx, 1, 0, t.n_threads);


@@ -319,6 +319,13 @@ struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t alloc) {
     return alloc->buffer;
 }
 
+void ggml_tallocr_set_buffer(ggml_tallocr_t talloc, struct ggml_backend_buffer * buffer) {
+    talloc->buffer = buffer;
+    talloc->base = ggml_backend_buffer_get_base(buffer);
+    talloc->alignment = ggml_backend_buffer_get_alignment(buffer);
+    ggml_tallocr_reset(talloc);
+}
+
 void ggml_tallocr_free(ggml_tallocr_t alloc) {
     if (alloc == NULL) {
         return;
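Note: the new ggml_tallocr_set_buffer re-targets an existing tensor allocator at a different backing buffer and resets it, which is what the llama.cpp changes below use to hand out input tensors from a fresh CPU buffer for each microbatch. As a rough illustration of the idea only (a simplified standalone mock, not the real ggml_tallocr internals), a linear allocator that can be re-pointed looks like this:

#include <cstddef>
#include <cstdint>
#include <vector>

// Simplified mock of a linear (bump) tensor allocator. The real ggml_tallocr
// also tracks alignment and measure mode; this only shows the re-targeting idea.
struct tallocr_mock {
    uint8_t * base   = nullptr;  // start of the backing buffer
    size_t    size   = 0;        // capacity of the backing buffer
    size_t    offset = 0;        // bump pointer

    void set_buffer(uint8_t * new_base, size_t new_size) {
        base   = new_base;
        size   = new_size;
        offset = 0;              // equivalent of ggml_tallocr_reset()
    }

    uint8_t * alloc(size_t n) {
        if (offset + n > size) return nullptr;  // out of space in this buffer
        uint8_t * p = base + offset;
        offset += n;
        return p;
    }
};

int main() {
    std::vector<uint8_t> buf_a(1024), buf_b(1024);
    tallocr_mock ta;

    ta.set_buffer(buf_a.data(), buf_a.size());
    ta.alloc(256);                              // microbatch 0 inputs land in buf_a

    ta.set_buffer(buf_b.data(), buf_b.size());  // next microbatch: switch buffers while
    ta.alloc(256);                              // the previous one can still be in flight
}

The important property is that switching buffers is cheap, so a previously filled input buffer can stay in use by the backend while the next microbatch is prepared in another one.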


@@ -59,6 +59,7 @@ GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_b
 GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
 
 GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);
+GGML_API void ggml_tallocr_set_buffer(ggml_tallocr_t talloc, struct ggml_backend_buffer * buffer);
 GGML_API void   ggml_tallocr_free       (ggml_tallocr_t talloc);
 GGML_API bool   ggml_tallocr_is_measure (ggml_tallocr_t talloc);

llama.cpp

@@ -1663,7 +1663,9 @@ struct llama_context {
     std::vector<uint8_t> buf_compute_meta;
     ggml_backend_sched_t sched = nullptr;
 
     // allocator for the input tensors
-    ggml_tallocr * alloc = nullptr;
+    ggml_tallocr * alloc_cpu = nullptr;
+
+    std::vector<ggml_backend_buffer_t> buf_cpu_ub;
 
     // temporary buffer for copying data to/from the backend
     std::vector<no_init<uint8_t>> buf_copy;
@@ -3208,7 +3210,8 @@ static bool llm_load_tensors(
     const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
 
     // there is very little benefit to offloading the input layer, so always keep it on the CPU
-    model.buft_input = llama_default_buffer_type_cpu(true);
+    //model.buft_input = llama_default_buffer_type_cpu(true);
+    model.buft_input = llama_default_buffer_type_offload(main_gpu);
 
     model.buft_layer.resize(n_layer);
@@ -5955,7 +5958,7 @@ static struct ggml_cgraph * llama_build_graph(
     const auto & model = lctx.model;
 
     // check if we should build the worst-case graph (for memory measurement)
-    const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
+    const bool worst_case = ggml_tallocr_is_measure(lctx.alloc_cpu);
 
     // keep track of the input that has already been allocated
     bool alloc_inp_tokens = false;
@@ -5978,9 +5981,9 @@
         //
         if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) {
-            ggml_tallocr_alloc(lctx.alloc, cur);
+            ggml_tallocr_alloc(lctx.alloc_cpu, cur);
 
-            if (!ggml_tallocr_is_measure(lctx.alloc) && batch.token) {
+            if (!ggml_tallocr_is_measure(lctx.alloc_cpu) && batch.token) {
                 const int64_t n_tokens = cur->ne[0];
 
                 ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur));
@@ -5990,9 +5993,9 @@
         }
 
         if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0 && batch.embd) {
-            ggml_tallocr_alloc(lctx.alloc, cur);
+            ggml_tallocr_alloc(lctx.alloc_cpu, cur);
 
-            if (!ggml_tallocr_is_measure(lctx.alloc) && batch.embd) {
+            if (!ggml_tallocr_is_measure(lctx.alloc_cpu) && batch.embd) {
                 const int64_t n_embd   = cur->ne[0];
                 const int64_t n_tokens = cur->ne[1];
@@ -6003,9 +6006,9 @@
         }
 
         if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) {
-            ggml_tallocr_alloc(lctx.alloc, cur);
+            ggml_tallocr_alloc(lctx.alloc_cpu, cur);
 
-            if (!ggml_tallocr_is_measure(lctx.alloc) && batch.pos) {
+            if (!ggml_tallocr_is_measure(lctx.alloc_cpu) && batch.pos) {
                 const int64_t n_tokens = cur->ne[0];
 
                 static_assert(std::is_same<llama_pos, int32_t>::value, "llama_pos must be int32_t");
@@ -6016,9 +6019,9 @@
         }
 
         if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) {
-            ggml_tallocr_alloc(lctx.alloc, cur);
+            ggml_tallocr_alloc(lctx.alloc_cpu, cur);
 
-            if (!ggml_tallocr_is_measure(lctx.alloc)) {
+            if (!ggml_tallocr_is_measure(lctx.alloc_cpu)) {
                 const int64_t n_kv     = cur->ne[0];
                 const int64_t n_tokens = cur->ne[1];
@@ -6056,9 +6059,9 @@
         }
 
         if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) {
-            ggml_tallocr_alloc(lctx.alloc, cur);
+            ggml_tallocr_alloc(lctx.alloc_cpu, cur);
 
-            if (!ggml_tallocr_is_measure(lctx.alloc)) {
+            if (!ggml_tallocr_is_measure(lctx.alloc_cpu)) {
                 const int64_t n_ctx = cur->ne[0];
 
                 int32_t * data;
@@ -6161,10 +6164,11 @@
 //
 static int llama_decode_internal(
          llama_context & lctx,
-           llama_batch   batch) {
-    const uint32_t n_tokens = batch.n_tokens;
-    if (n_tokens == 0) {
+           llama_batch   all_batch) {
+
+    const uint32_t n_tokens_all = all_batch.n_tokens;
+
+    if (n_tokens_all == 0) {
         LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
         return -1;
     }
@@ -6173,12 +6177,11 @@ static int llama_decode_internal(
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
 
-    const auto n_batch = cparams.n_batch;
-    GGML_ASSERT(n_tokens <= n_batch);
-    int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
-    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
+    //const auto n_batch = cparams.n_batch;
+    GGML_ASSERT((!all_batch.token && all_batch.embd) || (all_batch.token && !all_batch.embd)); // NOLINT
+    GGML_ASSERT(n_tokens_all <= cparams.n_ctx);
 
     const int64_t t_start_us = ggml_time_us();
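Note: the asserts above replace the old n_tokens <= n_batch limit; the batch passed to llama_decode_internal now only has to fit in the context, since it is split into microbatches internally. A hypothetical caller-side sketch (not part of this commit, assuming the llama.h API of this branch) of what that allows:

#include "llama.h"

#include <vector>

// Submit an entire tokenized prompt in one llama_decode call; with this change
// the only hard limit checked up front is the context size, and the slicing
// into microbatches happens inside llama_decode_internal.
static int decode_long_prompt(llama_context * ctx, std::vector<llama_token> & tokens) {
    if (tokens.size() > (size_t) llama_n_ctx(ctx)) {
        return -1; // still has to fit in the context window
    }
    // single call even if tokens.size() is larger than the configured n_batch
    return llama_decode(ctx, llama_batch_get_one(tokens.data(), (int32_t) tokens.size(), 0, 0));
}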
@@ -6188,13 +6191,51 @@ static int llama_decode_internal(
     //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
 #endif
 
-    GGML_ASSERT(n_threads > 0);
-
     auto & kv_self = lctx.kv_self;
 
     const int64_t n_embd  = hparams.n_embd;
     const int64_t n_vocab = hparams.n_vocab;
 
+    auto & logits_out = lctx.logits;
+
+    if (all_batch.logits) {
+        logits_out.resize(n_vocab * n_tokens_all);
+    } else if (lctx.logits_all) {
+        logits_out.resize(n_vocab * n_tokens_all);
+    } else {
+        logits_out.resize(n_vocab);
+    }
+
+#ifndef NDEBUG
+    auto & logits_valid = lctx.logits_valid;
+    logits_valid.clear();
+    logits_valid.resize(n_tokens_all);
+    logits_out.clear();
+#endif
+
+    const uint32_t n_microbatch = 256;
+
+    for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_microbatch) {
+        const uint32_t n_tokens = std::min(n_microbatch, n_tokens_all - cur_token);
+        llama_batch batch = {
+            /* .n_tokens   = */ (int32_t) n_tokens,
+            /* .token      = */ all_batch.token     ? all_batch.token    + cur_token        : nullptr,
+            /* .embd       = */ all_batch.embd      ? all_batch.embd     + cur_token*n_embd : nullptr,
+            /* .pos        = */ all_batch.pos       ? all_batch.pos      + cur_token        : nullptr,
+            /* .n_seq_id   = */ all_batch.n_seq_id  ? all_batch.n_seq_id + cur_token        : nullptr,
+            /* .seq_id     = */ all_batch.seq_id    ? all_batch.seq_id   + cur_token        : nullptr,
+            /* .logits     = */ all_batch.logits    ? all_batch.logits   + cur_token        : nullptr,
+            /* .all_pos_0  = */ all_batch.all_pos_0 + (llama_pos) cur_token*all_batch.all_pos_1,
+            /* .all_pos_1  = */ all_batch.all_pos_1,
+            /* .all_seq_id = */ all_batch.all_seq_id,
+        };
+
+        int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+        GGML_ASSERT(n_threads > 0);
+
     // helpers for smoother batch API transition
     // after deprecating the llama_eval calls, these will be removed
     std::vector<llama_pos> pos;
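Note: the loop above slices the caller's batch into fixed 256-token microbatches by offsetting the per-token arrays: token, pos, n_seq_id, seq_id and logits advance by cur_token entries, while embd advances by cur_token*n_embd floats. A standalone sketch of just that slicing arithmetic (mock view type, not the real llama_batch):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Mock of the per-token views built above; the real llama_batch has more fields.
struct batch_view {
    uint32_t        n_tokens;
    const int32_t * token;   // all_batch.token + cur_token
    const float   * embd;    // all_batch.embd  + cur_token*n_embd
    const int32_t * pos;     // all_batch.pos   + cur_token
};

int main() {
    const uint32_t n_tokens_all = 1000;
    const uint32_t n_microbatch = 256;
    const uint32_t n_embd       = 8;

    std::vector<int32_t> tokens(n_tokens_all);
    std::vector<float>   embd  (n_tokens_all * n_embd);
    std::vector<int32_t> pos   (n_tokens_all);

    for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_microbatch) {
        const uint32_t n_tokens = std::min(n_microbatch, n_tokens_all - cur_token);
        batch_view ub = {
            n_tokens,
            tokens.data() + cur_token,
            embd.data()   + cur_token*n_embd,
            pos.data()    + cur_token,
        };
        // 1000 tokens -> microbatches of 256, 256, 256, 232
        printf("microbatch at %u: %u tokens\n", cur_token, ub.n_tokens);
    }
}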
@@ -6234,6 +6275,7 @@
     }
 
     if (!llama_kv_cache_find_slot(kv_self, batch)) {
+        LLAMA_LOG_ERROR("%s: failed to find a slot in the cache", __func__);
         return 1;
     }
@@ -6245,6 +6287,19 @@
     //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
 
+    int i_ub = cur_token / n_microbatch;
+    size_t n_buf = lctx.buf_cpu_ub.size();
+    if (i_ub != 0 && i_ub % n_buf == 0) {
+        // sync all backends
+        printf("not enough buffers, syncing now\n");
+        // TODO: ggml_backend_sched_synchronize()
+        for (auto * backend : lctx.backends) {
+            ggml_backend_synchronize(backend);
+        }
+    }
+
+    ggml_tallocr_set_buffer(lctx.alloc_cpu, lctx.buf_cpu_ub[i_ub % n_buf]);
+
     ggml_backend_sched_reset(lctx.sched);
 
     ggml_cgraph * gf = llama_build_graph(lctx, batch);
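Note: each microbatch takes the next CPU input buffer from the pool (index i_ub % n_buf); once every buffer has been handed out, all backends are synchronized before buffer 0 is overwritten again, so the pool size bounds how many microbatches can be in flight. The selection logic in isolation (a simplified sketch with a small pool instead of the 64 buffers allocated further down):

#include <cstdio>

int main() {
    const int n_buf        = 4;    // size of the CPU input buffer pool (64 in the demo)
    const int n_microbatch = 256;
    const int n_tokens_all = 4096;

    for (int cur_token = 0; cur_token < n_tokens_all; cur_token += n_microbatch) {
        int i_ub = cur_token / n_microbatch;
        if (i_ub != 0 && i_ub % n_buf == 0) {
            // all buffers have been handed out once: wait for the backends to
            // drain before overwriting buffer 0 with new input data
            printf("microbatch %2d: sync all backends\n", i_ub);
        }
        printf("microbatch %2d -> input buffer %d\n", i_ub, i_ub % n_buf);
    }
}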
@@ -6271,32 +6326,28 @@
         n_threads = std::min(4, n_threads);
     }
 
-    const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
-    if (ggml_cpu_has_cublas() && fully_offloaded) {
-        n_threads = 1;
-    }
-
 #ifdef GGML_USE_MPI
     const int64_t n_layer = hparams.n_layer;
     ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
 #endif
 
 #ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(lctx.backend_metal)) {
+    if (lctx.backend_metal != nullptr) {
         ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
     }
 #endif
 
     if (lctx.backend_cpu != nullptr) {
         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
     }
 
     ggml_backend_sched_graph_compute(lctx.sched, gf);
 
     // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
 
 #ifdef GGML_USE_MPI
     ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
 #endif
 
     // update the kv ring buffer
     {
@@ -6315,11 +6366,11 @@
         }
     }
 
 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
     // requires GGML_PERF to be defined
     ggml_graph_print(gf);
 #endif
 
     // plot the computation graph in dot format (for debugging purposes)
     //if (n_past%100 == 0) {
@@ -6330,63 +6381,65 @@
     // TODO: do not compute and extract logits if only embeddings are needed
     //       need to update the graphs to skip "result_output"
     {
-        auto & logits_out = lctx.logits;
-
-#ifndef NDEBUG
-        auto & logits_valid = lctx.logits_valid;
-        logits_valid.clear();
-        logits_valid.resize(n_tokens);
-        logits_out.clear();
-#endif
-
         ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res);
         GGML_ASSERT(res_backend != nullptr);
         if (batch.logits) {
-            logits_out.resize(n_vocab * n_tokens);
+            //logits_out.resize(n_vocab * n_tokens);
             for (uint32_t i = 0; i < n_tokens; i++) {
                 if (batch.logits[i] == 0) {
                     continue;
                 }
-                ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
+                ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + n_vocab*(cur_token + i), n_vocab*i*sizeof(float), n_vocab*sizeof(float));
 #ifndef NDEBUG
                 logits_valid[i] = true;
 #endif
             }
         } else if (lctx.logits_all) {
-            logits_out.resize(n_vocab * n_tokens);
-            ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
+            //logits_out.resize(n_vocab * n_tokens);
+            //ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
+            ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + cur_token*n_vocab, 0, n_vocab*n_tokens*sizeof(float));
 #ifndef NDEBUG
             std::fill(logits_valid.begin(), logits_valid.end(), true);
 #endif
         } else {
-            logits_out.resize(n_vocab);
-            ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
+            if (cur_token + n_tokens >= n_tokens_all) {
+                //logits_out.resize(n_vocab);
+                ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), n_vocab*(n_tokens - 1)*sizeof(float), n_vocab*sizeof(float));
+            }
+            //ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), n_vocab*(n_tokens - 1)*sizeof(float), n_vocab*sizeof(float));
 #ifndef NDEBUG
             logits_valid[0] = true;
 #endif
         }
-        ggml_backend_synchronize(res_backend);
+        //ggml_backend_synchronize(res_backend);
     }
 
+    // FIXME
     // extract embeddings
     if (!lctx.embedding.empty()) {
+        GGML_ASSERT(!"not implemented");
         auto & embedding_out = lctx.embedding;
 
         embedding_out.resize(n_embd);
         ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
         ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
-        ggml_backend_synchronize(embeddings_backend);
+        //ggml_backend_synchronize(embeddings_backend);
+    }
     }
 
+    // TODO: ggml_backend_sched_synchronize()
+    for (auto * backend : lctx.backends) {
+        ggml_backend_synchronize(backend);
+    }
+
     // measure the performance only for the single-token evals
-    if (n_tokens == 1) {
+    if (n_tokens_all == 1) {
         lctx.t_eval_us += ggml_time_us() - t_start_us;
         lctx.n_eval++;
     }
-    else if (n_tokens > 1) {
+    else if (n_tokens_all > 1) {
         lctx.t_p_eval_us += ggml_time_us() - t_start_us;
-        lctx.n_p_eval += n_tokens;
+        lctx.n_p_eval += n_tokens_all;
     }
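Note: since lctx.logits is now sized for the whole batch before the microbatch loop, each microbatch copies its rows into the right slice of the shared buffer: row i of the current microbatch lands at float offset n_vocab*(cur_token + i). A toy standalone check of that indexing (hypothetical sizes):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
    const uint32_t n_vocab      = 5;
    const uint32_t n_tokens_all = 10;
    const uint32_t n_microbatch = 4;

    // sized once for the whole batch, as in the resize above the microbatch loop
    std::vector<float> logits_out(n_vocab * n_tokens_all, 0.0f);

    for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_microbatch) {
        const uint32_t n_tokens = std::min(n_microbatch, n_tokens_all - cur_token);
        for (uint32_t i = 0; i < n_tokens; i++) {
            // destination row for token i of this microbatch in the full buffer
            float * dst = logits_out.data() + n_vocab*(cur_token + i);
            dst[0] = (float) (cur_token + i);   // stand-in for the copied logits row
        }
    }

    // every row of the full batch was written exactly once
    for (uint32_t t = 0; t < n_tokens_all; t++) {
        assert(logits_out[n_vocab*t] == (float) t);
    }
}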
// get a more accurate load time, upon first eval // get a more accurate load time, upon first eval
@@ -9402,7 +9455,7 @@ struct llama_context * llama_new_context_with_model(
         ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
 
         ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
-        ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
+        ctx->alloc_cpu = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
 
         // build worst-case graph
         int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
@@ -9415,7 +9468,17 @@ struct llama_context * llama_new_context_with_model(
         // note: the number of splits during measure is higher than during inference due to the kv shift
         int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
         LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
-        ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
+        ctx->alloc_cpu = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
+
+        // duplicate cpu buffers for microbatching
+        ggml_backend_buffer_t buf_cpu = ggml_tallocr_get_buffer(ctx->alloc_cpu);
+        size_t buf_size = ggml_backend_buffer_get_size(buf_cpu);
+        ctx->buf_cpu_ub.push_back(buf_cpu);
+        int n_ub = 64;
+        for (int i = 1; i < n_ub; ++i) {
+            ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buf_size);
+            ctx->buf_cpu_ub.push_back(buf);
+        }
 
         for (ggml_backend_t backend : ctx->backends) {
             ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend);