diff --git a/ggml-threading.c b/ggml-threading.c
index 4a9cf622f..146831859 100644
--- a/ggml-threading.c
+++ b/ggml-threading.c
@@ -167,7 +167,6 @@ struct ggml_compute_state_shared {
 
     // commands.
     atomic_bool wait_now;
-    atomic_bool wait_on_done;
     atomic_bool stop;
 
     // Default task runner, can be overriden by node.task_profile.runner.
@@ -263,11 +262,12 @@ void ggml_threading_suspend(struct ggml_threading_context *ctx) {
 
     struct ggml_compute_state_shared *shared = &ctx->shared;
 
+    const int n_worker_threads = ctx->n_threads - 1;
+
     ggml_spin_lock(&shared->spin);
     shared->wait_now = true;
     ggml_spin_unlock(&ctx->shared.spin);
 
-    const int n_worker_threads = ctx->n_threads - 1;
     while (shared->n_waiting != n_worker_threads) {
         ggml_spin_pause();
     }
@@ -281,8 +281,8 @@ void ggml_threading_suspend(struct ggml_threading_context *ctx) {
 
 // Wakeup all workers.
 //
-// Workers takes some time to wakeup, and has to lock spin after wakeup. Yield
-// is used to avoid signal frequently. Current implementation is highly
+// Workers take some time to wake up.
+// Yield is used to avoid notifying too frequently. The current implementation is highly
 // experimental. See tests/test-ggml-threading.c for details.
 void ggml_threading_resume(struct ggml_threading_context *ctx) {
     if (ctx->n_threads == 1) {
@@ -302,14 +302,20 @@ void ggml_threading_resume(struct ggml_threading_context *ctx) {
     // Dead lock detection.
     int counter = 0;
     int64_t last_notify_ms = 0;
-    const int max_notify_count = ctx->n_threads - 1;
-    const int max_duration_ms = 100 * max_notify_count;
+    const int max_notify_count = 50;
+    const int max_duration_ms = 1000;
+
+    if (shared->n_waiting != 0 && shared->n_waiting != ctx->n_threads - 1) {
+        fprintf(stderr,
+                "[ggml-threading] expected n_waiting to be 0 or %d, actual %d, abort\n",
+                ctx->n_threads - 1, shared->n_waiting);
+        abort();
+    }
 
     ggml_spin_lock(&shared->spin);
     shared->wait_now = false;
 
     while (shared->n_waiting != 0) {
-        GGML_ASSERT(pthread_mutex_lock(&shared->mutex) == 0);
 
         if (shared->n_waiting == 0) {
             GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0);
@@ -323,7 +329,11 @@ void ggml_threading_resume(struct ggml_threading_context *ctx) {
         GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0);
         last_notify_ms = ggml_time_ms();
 
-        sched_yield();
+        if (counter % 1 == 0) {
+            ggml_spin_pause();
+        } else {
+            sched_yield();
+        }
 
         int elapsed = last_notify_ms > 0 ? ggml_time_ms() - last_notify_ms : 0;
 
@@ -372,24 +382,6 @@ static void ggml_threading_setup_workers(struct ggml_threading_context *ctx,
             ggml_threading_resume(ctx);
             ggml_spin_lock(&shared->spin);
         }
-
-        if ((ctx->features & GGML_THREADING_FEATURE_WAIT_ON_DONE) > 0) {
-            // Optimize energy: wait_on_done. We MAY also check following nodes,
-            // but that's a bit complicated.
-            shared->wait_on_done = false;
-            for (int i = type + 1; i <= GGML_TASK_FINALIZE; i++) {
-                struct ggml_task_stage *next = &profile->stages[i];
-                if (next->parallel) {
-                    break;
-                }
-                if (next->wait) {
-                    shared->wait_on_done = true;
-                    PRINT_DEBUG("[main] wait_on_done is enabled for "
-                                "current task stage\n");
-                    break;
-                }
-            }
-        }
     } else if (current->wait) {
         if (shared->n_waiting < n_worker_threads) {
             ggml_spin_unlock(&ctx->shared.spin);
@@ -437,17 +429,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) {
                 state->has_work = false;
                 shared->n_tasks--;
 
-                bool wait = shared->wait_on_done && !state->has_work;
-                if (wait) {
-                    ggml_threading_cond_wait(state);
-                }
-
                 ggml_spin_unlock(&shared->spin);
-
-                // no need to pause.
-                if (wait) {
-                    continue;
-                }
             }
 
             ggml_spin_pause();
@@ -594,11 +576,11 @@ ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner,
         .n_tasks = 0,
         .n_waiting = 0,
         .wait_now = false,
-        .wait_on_done = false,
         .stop = false,
         .task_runner = task_runner,
         .ctx = ctx,
     };
+    atomic_flag_clear(&ctx->shared.spin);
 
     PRINT_DEBUG("[main] thread start, features: %d\n", features);
 
diff --git a/ggml-threading.h b/ggml-threading.h
index 012c9cd50..f226b0019 100644
--- a/ggml-threading.h
+++ b/ggml-threading.h
@@ -17,8 +17,7 @@ struct ggml_threading_context;
 // Optional (experimental) features.
 enum ggml_threading_features {
     GGML_THREADING_FEATURE_NONE = 0,
-    GGML_THREADING_FEATURE_WAIT_ON_DONE = 1 << 0,
-    GGML_THREADING_FEATURE_PERF = 1 << 1,
+    GGML_THREADING_FEATURE_PERF = 1,
 };
 
 // The thread runner to feed into OS threads.
diff --git a/ggml-tune.c b/ggml-tune.c
index 36c44e1dc..7aa9c217c 100644
--- a/ggml-tune.c
+++ b/ggml-tune.c
@@ -802,7 +802,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
 
     struct ggml_threading_context *thrd_ctx =
         ggml_threading_start(tune->n_threads, NULL, NULL,
-                             GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time);
+                             GGML_THREADING_FEATURE_NONE, stages_time);
 
     for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {
         const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape];
diff --git a/ggml.c b/ggml.c
index 75a562481..614212b67 100644
--- a/ggml.c
+++ b/ggml.c
@@ -16203,14 +16203,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
         }
 
         // ~ 50 us
-        //printf("=== prepare computing took %d us\n", (int)(ggml_time_us() - t0));
+        // printf("=== prepare computing took %d us\n", (int)(ggml_time_us() - t0));
     }
 
     const int64_t perf_start_cycles = ggml_perf_cycles();
     const int64_t perf_start_time_us = ggml_perf_time_us();
 
     struct ggml_threading_context *thrd_ctx = ggml_threading_start(n_threads,
-        NULL, ggml_compute_forward, GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL);
+        NULL, ggml_compute_forward, GGML_THREADING_FEATURE_NONE, NULL);
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes);
diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c
index 8fc705a6b..cfb5b7a8e 100644
--- a/tests/test-ggml-threading.c
+++ b/tests/test-ggml-threading.c
@@ -71,12 +71,7 @@ static int test_driver(int id, struct ggml_tensor *node, int n_threads) {
         work_done_arr[i] = 0;
     }
 
-    bool wait_on_done = (node->task_profile.dev_flags[0] > 0u);
-
     enum ggml_threading_features features = GGML_THREADING_FEATURE_PERF;
-    if (wait_on_done) {
-        features |= GGML_THREADING_FEATURE_WAIT_ON_DONE;
-    }
 
     int t0 = (int)ggml_time_us();
 
@@ -118,9 +113,9 @@ static int test_driver(int id, struct ggml_tensor *node, int n_threads) {
     }
 
     printf("\tstage-0: parallel: %d, wait: %d\n\tstage-1: parallel: %d, wait: "
-           "%d, wait_on_done: %d %s\n",
+           "%d\n",
            stages[0].parallel, stages[0].wait, stages[1].parallel,
-           stages[1].wait, wait_on_done, stages[1].wait ? "<--------" : "");
+           stages[1].wait);
 
     if (actual == expect) {
         printf("\tthreading: init %6.3f ms, compute %6.3f ms, cleanup %6.3f "
@@ -214,7 +209,7 @@ lifecycle_runner(const struct ggml_compute_params *params,
 }
 
 // Test thread lifecycle: start -> suspend -> resume -> stop
-static int test_lifecycle(bool wait_on_done) {
+static int test_lifecycle(void) {
     struct ggml_tensor node;
     memset(&node, 0, sizeof(struct ggml_tensor));
 
@@ -243,9 +238,7 @@ static int test_lifecycle(bool wait_on_done) {
     int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]);
     int n_threads = 1;
 
-    enum ggml_threading_features features =
-        wait_on_done ? GGML_THREADING_FEATURE_NONE
-                     : GGML_THREADING_FEATURE_WAIT_ON_DONE;
+    enum ggml_threading_features features = GGML_THREADING_FEATURE_NONE;
     for (int i = 0; i < threads_arr_len; i++) {
         n_threads = threads_arr[i];
         int start_time = (int)ggml_time_ms();
@@ -351,7 +344,6 @@ int main(void) {
     // physical cores:
     // - the wait/wakeup time varies much: can be up to tens or hundreds of the
     //   average time, thus greatly punishes those small workloads.
-    // - wait_on_done is general faster than wait_now, can be 10x faster.
 
     int threads_arr[] = {1, 2, 4, 6, 8, 16, 32, 64};
     int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]);
@@ -408,7 +400,7 @@ int main(void) {
         }
     }
 
-    // node.task_profile.dev_flags: byte 0 for wait_on_done, byte 1 for loops.
+    // node.task_profile.dev_flags: byte 0 unused, byte 1 for loops.
     for (int x = 0; x < workload_arr_len; x++) {
         node.task_profile.dev_flags[1] = workload_arr[x];
 
@@ -423,7 +415,7 @@ int main(void) {
                "n_threads: %2d ====\n",
                workload_arr[x], n_threads);
 
-        // multi-threads: parallel + wait_now/wait_on_done
+        // multi-threads: parallel + wait
 
         if (n_threads == 1) {
             stages[0].parallel = false;
@@ -489,22 +481,11 @@ int main(void) {
                 abort();
             }
 
-            { // disable wait_on_done
-                node.task_profile.dev_flags[0] = 0u; // wait now.
+            node.task_profile.dev_flags[0] = 0u; // wait now.
 
-                n_tests++;
-                if (test_driver(n_tests, &node, n_threads) == 0) {
-                    n_passed++;
-                }
-            }
-
-            { // enable wait_on_done
-                node.task_profile.dev_flags[0] = 1u; // wait on done
-
-                n_tests++;
-                if (test_driver(n_tests, &node, n_threads) == 0) {
-                    n_passed++;
-                }
+            n_tests++;
+            if (test_driver(n_tests, &node, n_threads) == 0) {
+                n_passed++;
             }
         }
     }
@@ -548,18 +529,12 @@ int main(void) {
     }
 
     // lifecycle.
-    for (int i = 0; i < 2; i++) {
-        bool wait_on_done = (i == 1);
-        printf("[test-ggml-threading] test lifecycle (wait_on_done = %d) ...\n",
-               wait_on_done);
-        ++n_tests;
+    printf("[test-ggml-threading] test lifecycle ...\n");
+    ++n_tests;
 
-        if (test_lifecycle(wait_on_done) == 0) {
-            ++n_passed;
-            printf("[test-ggml-threading] test lifecycle (wait_on_done = %d): "
-                   "ok\n\n",
-                   wait_on_done);
-        }
+    if (test_lifecycle() == 0) {
+        ++n_passed;
+        printf("[test-ggml-threading] test lifecycle: ok\n\n");
     }
 
     printf("[test-ggml-threading] %d/%d passed.\n", n_passed, n_tests);