From 928e0b7013c862cf10701957b3d654aa70f11bd8 Mon Sep 17 00:00:00 2001
From: agray3
Date: Fri, 26 Apr 2024 19:08:30 +0100
Subject: [PATCH] Reset schedule earlier to allow overlap with ggml graph computation on device (#6933)

* Reset schedule earlier to allow overlap with graph computation on device
---
 ggml-backend.c | 12 +++++++-----
 llama.cpp      |  4 ++++
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index e91d97cd9..f5bdcf078 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -1784,12 +1784,14 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
 
 void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     // reset state for the next run
-    size_t hash_size = sched->hash_set.size;
-    memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
-    memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
-    memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
+    if (!sched->is_reset) {
+        size_t hash_size = sched->hash_set.size;
+        memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
+        memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
+        memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
 
-    sched->is_reset = true;
+        sched->is_reset = true;
+    }
     sched->is_alloc = false;
 }
 
diff --git a/llama.cpp b/llama.cpp
index dd8b1f264..49f2b559e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -11473,6 +11473,10 @@ static int llama_decode_internal(
         }
     }
 
+    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+    // overlap with device computation.
+    ggml_backend_sched_reset(lctx.sched);
+
     return 0;
 }
 