From e73009ea514ec35189fa98270f096ea8d81093ea Mon Sep 17 00:00:00 2001
From: slaren
Date: Fri, 12 Jan 2024 03:53:39 +0100
Subject: [PATCH] use async memcpys to copy the graph outputs to the CPU

---
 ggml-backend.c | 22 ++++++++++++++++++++++
 ggml-backend.h |  8 +++++---
 llama.cpp      | 15 +++++++++++----
 3 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index 62ffd0776..564b27c6c 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -772,6 +772,8 @@ struct ggml_backend_sched_split {
 };
 
 struct ggml_backend_sched {
+    bool is_reset; // true if the scheduler has been reset since the last graph split
+
     int n_backends;
     ggml_backend_t backends[GGML_MAX_BACKENDS];
     ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
@@ -968,6 +970,7 @@ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, co
 static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     // reset splits
     sched->n_splits = 0;
+    sched->is_reset = false;
 
     struct ggml_init_params params = {
         /* .mem_size = */ sizeof(sched->context_buffer),
@@ -1327,6 +1330,8 @@ static void sched_reset(ggml_backend_sched_t sched) {
     memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
     memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
     memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
+
+    sched->is_reset = true;
 }
 
 ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends, size_t graph_size) {
@@ -1352,6 +1357,8 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_bac
         sched->tallocs[i] = ggml_tallocr_new_measure_from_backend(backends[i]);
     }
 
+    sched_reset(sched);
+
     return sched;
 }
 
@@ -1389,9 +1396,16 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr
 void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
 
+    if (!sched->is_reset) {
+        sched_reset(sched);
+    }
+
     sched_split_graph(sched, graph);
     sched_alloc_splits(sched);
     sched_compute_splits(sched);
+}
+
+void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     sched_reset(sched);
 }
 
@@ -1417,6 +1431,14 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
     node_allocr(node) = sched->tallocs[backend_index];
 }
 
+ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
+    ggml_tallocr_t allocr = node_allocr(node);
+    if (allocr == NULL) {
+        return NULL;
+    }
+    return get_allocr_backend(sched, allocr);
+}
+
 // utils
 
 void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
diff --git a/ggml-backend.h b/ggml-backend.h
index 51f98adf8..67c63caaf 100644
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -160,11 +160,13 @@ extern "C" {
     GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
 
     GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+    GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
 
     // Allocate and compute graph on the backend scheduler
-    GGML_API void ggml_backend_sched_graph_compute(
-            ggml_backend_sched_t sched,
-            struct ggml_cgraph * graph);
+    GGML_API void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+
+    // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs
+    GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
 
     //
     // Utils
diff --git a/llama.cpp b/llama.cpp
index e3979865c..147885419 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6218,6 +6218,8 @@ static int llama_decode_internal(
 
     //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
 
+    ggml_backend_sched_reset(lctx.sched);
+
     ggml_cgraph * gf = llama_build_graph(lctx, batch);
 
     // the output is always the last tensor in the graph
@@ -6311,30 +6313,33 @@ static int llama_decode_internal(
         logits_out.clear();
 #endif
 
+        ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res);
+        GGML_ASSERT(res_backend != nullptr);
         if (batch.logits) {
             logits_out.resize(n_vocab * n_tokens);
             for (uint32_t i = 0; i < n_tokens; i++) {
                 if (batch.logits[i] == 0) {
                     continue;
                 }
-                ggml_backend_tensor_get(res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
+                ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
 #ifndef NDEBUG
                 logits_valid[i] = true;
 #endif
             }
         } else if (lctx.logits_all) {
             logits_out.resize(n_vocab * n_tokens);
-            ggml_backend_tensor_get(res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
+            ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
 #ifndef NDEBUG
             std::fill(logits_valid.begin(), logits_valid.end(), true);
 #endif
         } else {
             logits_out.resize(n_vocab);
-            ggml_backend_tensor_get(res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
+            ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
 #ifndef NDEBUG
             logits_valid[0] = true;
 #endif
         }
+        ggml_backend_synchronize(res_backend);
     }
 
     // extract embeddings
@@ -6342,7 +6347,9 @@ static int llama_decode_internal(
         auto & embedding_out = lctx.embedding;
 
         embedding_out.resize(n_embd);
-        ggml_backend_tensor_get(embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
+        ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
+        ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
+        ggml_backend_synchronize(embeddings_backend);
     }
 
     // measure the performance only for the single-token evals