threading: try to fix a deadlock, also added critical deadlock detection
This commit is contained in:
parent
cc8a375bc4
commit
aac7f7cc04
2 changed files with 37 additions and 26 deletions
|
@ -261,19 +261,22 @@ void ggml_threading_suspend(struct ggml_threading_context *ctx) {
|
||||||
PRINT_DEBUG("[main] wait_now will be set, expect %d workers wait\n",
|
PRINT_DEBUG("[main] wait_now will be set, expect %d workers wait\n",
|
||||||
n_worker_threads);
|
n_worker_threads);
|
||||||
|
|
||||||
ggml_spin_lock(&ctx->shared.spin);
|
struct ggml_compute_state_shared *shared = &ctx->shared;
|
||||||
ctx->shared.wait_now = true;
|
|
||||||
|
ggml_spin_lock(&shared->spin);
|
||||||
|
shared->wait_now = true;
|
||||||
ggml_spin_unlock(&ctx->shared.spin);
|
ggml_spin_unlock(&ctx->shared.spin);
|
||||||
|
|
||||||
const int n_worker_threads = ctx->n_threads - 1;
|
const int n_worker_threads = ctx->n_threads - 1;
|
||||||
while (ctx->shared.n_waiting != n_worker_threads) {
|
while (shared->n_waiting != n_worker_threads) {
|
||||||
ggml_spin_pause();
|
ggml_spin_pause();
|
||||||
}
|
}
|
||||||
|
|
||||||
PRINT_DEBUG("[main] saw %d workers waiting\n", n_worker_threads);
|
PRINT_DEBUG("[main] saw %d workers waiting\n", n_worker_threads);
|
||||||
ggml_spin_lock(&ctx->shared.spin);
|
|
||||||
|
ggml_spin_lock(&shared->spin);
|
||||||
ctx->suspending = true;
|
ctx->suspending = true;
|
||||||
ggml_spin_unlock(&ctx->shared.spin);
|
ggml_spin_unlock(&shared->spin);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Wakeup all workers.
|
// Wakeup all workers.
|
||||||
|
@ -296,44 +299,52 @@ void ggml_threading_resume(struct ggml_threading_context *ctx) {
|
||||||
perf_time_0 = ggml_time_us();
|
perf_time_0 = ggml_time_us();
|
||||||
}
|
}
|
||||||
|
|
||||||
int loop_counter = 0;
|
// Dead lock detection.
|
||||||
int64_t last_signal_time = 0;
|
int counter = 0;
|
||||||
|
int64_t last_notify_ms = 0;
|
||||||
|
const int max_notify_count = ctx->n_threads - 1;
|
||||||
|
const int max_duration_ms = 100 * max_notify_count;
|
||||||
|
|
||||||
ggml_spin_lock(&shared->spin);
|
ggml_spin_lock(&shared->spin);
|
||||||
shared->wait_now = false;
|
shared->wait_now = false;
|
||||||
|
|
||||||
while (shared->n_waiting != 0) {
|
while (shared->n_waiting != 0) {
|
||||||
ggml_spin_unlock(&shared->spin);
|
|
||||||
|
|
||||||
if (loop_counter > 0) {
|
|
||||||
ggml_spin_pause();
|
|
||||||
if (loop_counter > 3) {
|
|
||||||
sched_yield();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
++loop_counter;
|
|
||||||
|
|
||||||
// TODO: should bench actual average wait/wakeup time.
|
|
||||||
if (last_signal_time > 0 && (ggml_time_us() - last_signal_time) < 10) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
GGML_ASSERT(pthread_mutex_lock(&shared->mutex) == 0);
|
GGML_ASSERT(pthread_mutex_lock(&shared->mutex) == 0);
|
||||||
|
if (shared->n_waiting == 0) {
|
||||||
|
GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0);
|
||||||
|
ggml_spin_unlock(&shared->spin);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_spin_unlock(&shared->spin);
|
||||||
|
|
||||||
GGML_ASSERT(pthread_cond_broadcast(&shared->cond) == 0);
|
GGML_ASSERT(pthread_cond_broadcast(&shared->cond) == 0);
|
||||||
GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0);
|
GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0);
|
||||||
last_signal_time = ggml_time_us();
|
last_notify_ms = ggml_time_ms();
|
||||||
|
|
||||||
|
sched_yield();
|
||||||
|
|
||||||
|
int elapsed = last_notify_ms > 0 ? ggml_time_ms() - last_notify_ms : 0;
|
||||||
|
|
||||||
|
if ((counter > max_notify_count) || elapsed > max_duration_ms) {
|
||||||
|
fprintf(stderr,
|
||||||
|
"[ggml-threading] potential dead lock detected, notified "
|
||||||
|
"for %d times, elapsed time: %d ms, abort\n",
|
||||||
|
counter, elapsed);
|
||||||
|
abort();
|
||||||
|
}
|
||||||
|
|
||||||
ggml_spin_lock(&shared->spin);
|
ggml_spin_lock(&shared->spin);
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx->suspending = false;
|
ctx->suspending = false;
|
||||||
|
ggml_spin_unlock(&shared->spin);
|
||||||
|
|
||||||
if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) {
|
if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) {
|
||||||
ggml_perf_collect(&shared->ctx->wakeup_perf, perf_cycles_0,
|
ggml_perf_collect(&shared->ctx->wakeup_perf, perf_cycles_0,
|
||||||
perf_time_0);
|
perf_time_0);
|
||||||
};
|
};
|
||||||
|
|
||||||
ggml_spin_unlock(&shared->spin);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_threading_is_suspending(struct ggml_threading_context *ctx) {
|
bool ggml_threading_is_suspending(struct ggml_threading_context *ctx) {
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
|
|
||||||
#define UNUSED(x) (void)(x)
|
#define UNUSED(x) (void)(x)
|
||||||
|
|
||||||
#define MAX_N_THREADS 16
|
#define MAX_N_THREADS 64
|
||||||
|
|
||||||
static const int n_repeat = 10;
|
static const int n_repeat = 10;
|
||||||
|
|
||||||
|
@ -353,7 +353,7 @@ int main(void) {
|
||||||
// average time, thus greatly punishes those small workloads.
|
// average time, thus greatly punishes those small workloads.
|
||||||
// - wait_on_done is general faster than wait_now, can be 10x faster.
|
// - wait_on_done is general faster than wait_now, can be 10x faster.
|
||||||
|
|
||||||
int threads_arr[] = {1, 2, 4, 6, 8, 16};
|
int threads_arr[] = {1, 2, 4, 6, 8, 16, 32, 64};
|
||||||
int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]);
|
int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]);
|
||||||
|
|
||||||
// millions of loops.
|
// millions of loops.
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue