Make MUL_MAT initialization go fast

Using an atomic to delegate slices of the matrix to separate threads is
slow, because all the threads have to contend for the same memory spot.
The right thing to do here is to use a `chore` variable, where all threads
perform the same work-assignment computation independently.

This change introduces the ggml_once() and ggml_syncthreads() functions,
which work the exact same way as in CUDA. This is nice, since it means if
BLAS or LLAMAFILE doesn't need `B` requantized, then it can skip paying
for the synchronization barrier between the INIT and COMPUTE phases. We
can further refactor along this path, to remove all the INIT/COMP/FINAL
code too. All ops should be in charge of their own synchronization.
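To make the `chore` idea concrete, here is a minimal sketch (not code from
this patch; quantize_row() and N_ROWS are hypothetical placeholders): every
thread walks the same row index space and keeps a private round-robin
cursor, so each thread arrives at the identical row-to-thread mapping on
its own, without touching a shared atomic.

#define N_ROWS 1024

static void quantize_row(int row) { (void)row; /* stand-in for real work */ }

static void worker(int ith, int nth) {
    int chore = 0;                      // thread-local: zero contention
    for (int row = 0; row < N_ROWS; row++) {
        if (chore == ith) {
            quantize_row(row);          // this thread owns this row
        }
        if (++chore == nth) {           // advance the round-robin cursor
            chore = 0;
        }
    }
}

The only synchronization left is a barrier that orders the writes into the
work buffer before the COMPUTE phase reads them, which is what
ggml_syncthreads() below provides.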
Justine Tunney 2024-05-22 00:20:24 -07:00
parent ae6ee0b777
commit 7deec14bd9
2 changed files with 105 additions and 50 deletions

ggml.c (147 changes)

@@ -1719,6 +1719,59 @@ static inline void __lsx_f16x4_store(ggml_fp16_t *x, __m128 y) {
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
#endif
//
// synchronization primitives
//
struct ggml_once {
atomic_int state;
};
struct ggml_barrier {
atomic_uint phase;
atomic_int count;
};
void ggml_once(struct ggml_once * once, void init(void)) {
uint32_t old = atomic_load_explicit(&once->state, memory_order_acquire);
if (!old && atomic_compare_exchange_strong_explicit(&once->state, &old, 1,
memory_order_acquire,
memory_order_relaxed)) {
init();
atomic_store_explicit(&once->state, 2, memory_order_release);
return;
}
while (old == 1) {
old = atomic_load_explicit(&once->state, memory_order_acquire);
}
}
int ggml_delay(int backoff) {
if (backoff < 12) {
volatile int i;
for (i = 0; i != 1 << backoff; i++) {
}
backoff++;
} else {
sched_yield();
}
return backoff;
}
// creates barrier and blocks until all threads call this
void ggml_syncthreads(struct ggml_barrier * b, int nth) {
unsigned phase = atomic_load_explicit(&b->phase, memory_order_relaxed);
if (atomic_fetch_add_explicit(&b->count, 1, memory_order_acq_rel) + 1 == nth) {
atomic_store_explicit(&b->count, 0, memory_order_relaxed);
atomic_store_explicit(&b->phase, phase + 1, memory_order_release);
} else {
int backoff = 0;
while (atomic_load_explicit(&b->phase, memory_order_acquire) == phase) {
backoff = ggml_delay(backoff);
}
}
}
//
// fundamental operations
//
@@ -2783,7 +2836,6 @@ static void ggml_setup_op_has_task_pass(void) {
bool * p = GGML_OP_HAS_INIT;
p[GGML_OP_ACC ] = true;
p[GGML_OP_MUL_MAT ] = true;
p[GGML_OP_MUL_MAT_ID ] = true;
p[GGML_OP_OUT_PROD ] = true;
p[GGML_OP_SET ] = true;
@@ -12321,7 +12373,7 @@ static void ggml_compute_forward_mul_mat(
#if defined(GGML_USE_CLBLAST)
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) {
if (params->ith == 0) {
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
}
return;
@@ -12334,31 +12386,25 @@ static void ggml_compute_forward_mul_mat(
const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
UNUSED(desired_wsize);
if (params->type == GGML_TASK_TYPE_INIT) {
if (type != GGML_TYPE_F32) {
assert(params->wsize >= desired_wsize);
// parallelize by src0 rows
for (int64_t i13 = 0; i13 < ne13; i13++) {
for (int64_t i12 = 0; i12 < ne12; i12++) {
// broadcast src0 into src1 across 2nd,3rd dimension
const int64_t i03 = i13/r3;
const int64_t i02 = i12/r2;
if (type != GGML_TYPE_F32) {
assert(params->wsize >= desired_wsize);
// parallelize by src0 rows
for (int64_t i13 = 0; i13 < ne13; i13++) {
for (int64_t i12 = 0; i12 < ne12; i12++) {
// broadcast src0 into src1 across 2nd,3rd dimension
const int64_t i03 = i13/r3;
const int64_t i02 = i12/r2;
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
ggml_to_float_t const to_float = type_traits[type].to_float;
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
ggml_to_float_t const to_float = type_traits[type].to_float;
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
}
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
}
}
}
return;
}
if (params->type == GGML_TASK_TYPE_FINALIZE) {
return;
ggml_syncthreads(params->barrier, params->nth);
}
// perform sgemm, parallelization controlled by blas lib
@@ -12417,32 +12463,29 @@ static void ggml_compute_forward_mul_mat(
UseGgmlGemm1:;
#endif
if (params->type == GGML_TASK_TYPE_INIT) {
if (ith != 0) {
return;
}
if (src1->type != vec_dot_type) {
char * wdata = params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
if (src1->type != vec_dot_type) {
char * wdata = params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
assert(params->wsize >= ne11*ne12*ne13*row_size);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
assert(params->wsize >= ne11*ne12*ne13*row_size);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
for (int64_t i11 = 0; i11 < ne11; ++i11) {
int chore = 0;
for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
for (int64_t i11 = 0; i11 < ne11; ++i11) {
if (chore == ith) {
from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
wdata += row_size;
}
if (++chore == nth) {
chore = 0;
}
wdata += row_size;
}
}
}
return;
}
if (params->type == GGML_TASK_TYPE_FINALIZE) {
return;
ggml_syncthreads(params->barrier, params->nth);
}
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
@@ -19557,6 +19600,7 @@ struct ggml_compute_state_shared {
atomic_int n_active; // num active threads
atomic_int node_n; // active graph node
atomic_int node_task; // active graph node task phase
struct ggml_barrier barrier;
ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
void * abort_callback_data;
@@ -19882,11 +19926,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
// all other threads are finished and spinning
// do finalize and init here so we don't have synchronize again
struct ggml_compute_params params = {
/*.type =*/ GGML_TASK_TYPE_FINALIZE,
/*.ith =*/ 0,
/*.nth =*/ 0,
/*.wsize =*/ cplan->work_size,
/*.wdata =*/ cplan->work_data,
/*.type =*/ GGML_TASK_TYPE_FINALIZE,
/*.ith =*/ 0,
/*.nth =*/ 0,
/*.wsize =*/ cplan->work_size,
/*.wdata =*/ cplan->work_data,
/*.barrier =*/ &state->shared->barrier,
};
if (node_n != -1) {
@@ -19954,11 +19999,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
struct ggml_compute_params params = {
/*.type =*/ GGML_TASK_TYPE_INIT,
/*.ith =*/ state->ith,
/*.nth =*/ n_tasks,
/*.wsize =*/ cplan->work_size,
/*.wdata =*/ cplan->work_data,
/*.type =*/ GGML_TASK_TYPE_INIT,
/*.ith =*/ state->ith,
/*.nth =*/ n_tasks,
/*.wsize =*/ cplan->work_size,
/*.wdata =*/ cplan->work_data,
/*.barrier =*/ &state->shared->barrier,
};
if (state->ith < n_tasks) {
@@ -20233,6 +20279,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
/*.n_active =*/ n_threads,
/*.node_n =*/ -1,
/*.node_task =*/ GGML_TASK_TYPE_FINALIZE,
/*.barrier =*/ {0, 0},
/*.abort_callback =*/ NULL,
/*.abort_callback_data =*/ NULL,
};

ggml.h (8 changes)

@@ -680,6 +680,12 @@ extern "C" {
GGML_TASK_TYPE_FINALIZE,
};
struct ggml_once;
struct ggml_barrier;
int ggml_delay(int backoff);
void ggml_syncthreads(struct ggml_barrier * b, int nth);
void ggml_once(struct ggml_once * once, void init(void));
struct ggml_compute_params {
enum ggml_task_type type;
@@ -689,6 +695,8 @@ extern "C" {
// work buffer for all threads
size_t wsize;
void * wdata;
struct ggml_barrier *barrier;
};
// numa strategies
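To see how the new primitives fit together, here is a self-contained
illustration of the same phase-counting barrier pattern (an illustrative
sketch, not part of the patch: it re-implements the barrier locally with
C11 atomics and POSIX threads rather than linking against ggml.c, and
worker(), NTH, and the buffer sizes are invented for the demo). Each thread
writes only its own slice of a shared buffer, crosses the barrier once, and
can then safely read every other thread's slice, which is the same
guarantee ggml_syncthreads() gives between the requantization of `B` and
the dot products in mul_mat.

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NTH 4
#define N   64

struct barrier { atomic_uint phase; atomic_int count; };

static struct barrier g_bar;   // zero-initialized, like the {0, 0} in ggml_graph_compute()
static int g_buf[N];

// same algorithm as ggml_syncthreads(), minus the ggml_delay() backoff
static void syncthreads(struct barrier * b, int nth) {
    unsigned phase = atomic_load_explicit(&b->phase, memory_order_relaxed);
    if (atomic_fetch_add_explicit(&b->count, 1, memory_order_acq_rel) + 1 == nth) {
        atomic_store_explicit(&b->count, 0, memory_order_relaxed);
        atomic_store_explicit(&b->phase, phase + 1, memory_order_release);
    } else {
        while (atomic_load_explicit(&b->phase, memory_order_acquire) == phase) {
            sched_yield();             // the patch spins with exponential backoff instead
        }
    }
}

static void * worker(void * arg) {
    int ith = (int)(intptr_t)arg;
    for (int i = ith; i < N; i += NTH) {   // "INIT": write only this thread's slice
        g_buf[i] = i * i;
    }
    syncthreads(&g_bar, NTH);              // every slice is visible after this point
    long sum = 0;
    for (int i = 0; i < N; i++) {          // "COMPUTE": read the whole buffer
        sum += g_buf[i];
    }
    printf("thread %d sees sum %ld\n", ith, sum);
    return NULL;
}

int main(void) {
    pthread_t t[NTH];
    for (intptr_t i = 0; i < NTH; i++) pthread_create(&t[i], NULL, worker, (void *)i);
    for (int i = 0; i < NTH; i++) pthread_join(t[i], NULL);
    return 0;
}

Because the barrier tracks a phase counter rather than a one-shot flag, it
can be reused across any number of INIT/COMPUTE rounds without being reset,
which is why a single ggml_barrier in ggml_compute_state_shared suffices
for the whole graph.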