From 8435ab0ae8adb6c7f61bffac690314b9bbb4cfdd Mon Sep 17 00:00:00 2001
From: Justine Tunney
Date: Wed, 22 May 2024 00:31:28 -0700
Subject: [PATCH] Avoid INIT synchronization barrier when possible

This change makes inference go ~5% faster for me.
---
 ggml.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/ggml.c b/ggml.c
index 5abd6180e..15be6a59b 100644
--- a/ggml.c
+++ b/ggml.c
@@ -20013,19 +20013,21 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             }
         }
 
-        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
-            task_phase = GGML_TASK_TYPE_COMPUTE;
-            atomic_store(&state->shared->n_active, n_threads);
-            atomic_store(&state->shared->node_task, task_phase);
-        }
-        else {
-            // TODO: this sched_yield can have significant impact on the performance - either positive or negative
-            //       depending on the workload and the operating system.
-            //       since it is not clear what is the best approach, it should potentially become user-configurable
-            //       ref: https://github.com/ggerganov/ggml/issues/291
-            // UPD:  adding the do_yield flag seems to resolve the issue universally
-            const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
-            ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
+        if (GGML_OP_HAS_INIT[node->op]) {
+            if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
+                task_phase = GGML_TASK_TYPE_COMPUTE;
+                atomic_store(&state->shared->n_active, n_threads);
+                atomic_store(&state->shared->node_task, task_phase);
+            }
+            else {
+                // TODO: this sched_yield can have significant impact on the performance - either positive or negative
+                //       depending on the workload and the operating system.
+                //       since it is not clear what is the best approach, it should potentially become user-configurable
+                //       ref: https://github.com/ggerganov/ggml/issues/291
+                // UPD:  adding the do_yield flag seems to resolve the issue universally
+                const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
+                ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
+            }
         }
 
         if (state->ith < n_tasks) {