Merge branch 'master' into gg/flash-attn
commit b3dd7d975f
75 changed files with 4927 additions and 1996 deletions

ggml.c (360 changed lines shown below)
@@ -1478,6 +1478,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
+// TODO: optimize performance
+inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
+inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
 
 static const float GELU_COEF_A     = 0.044715f;
 static const float GELU_QUICK_COEF = -1.702f;
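For reference, the two new vector ops implement the standard piecewise-linear approximations: hardsigmoid(x) = clamp((x + 3) / 6, 0, 1) and hardswish(x) = x * hardsigmoid(x). A minimal standalone sketch of that math in plain C (illustration only, no ggml dependencies):

    #include <math.h>
    #include <stdio.h>

    // hardsigmoid(x) = clamp((x + 3) / 6, 0, 1)
    static float hardsigmoid(float x) { return fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f)); }

    // hardswish(x) = x * hardsigmoid(x)
    static float hardswish(float x) { return x * hardsigmoid(x); }

    int main(void) {
        const float xs[5] = { -4.0f, -1.0f, 0.0f, 1.0f, 4.0f };
        for (int i = 0; i < 5; ++i) {
            printf("x=%5.1f  hardsigmoid=%.3f  hardswish=%.3f\n", xs[i], hardsigmoid(xs[i]), hardswish(xs[i]));
        }
        return 0;
    }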
@@ -1838,9 +1841,11 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
     "GELU",
     "GELU_QUICK",
     "SILU",
+    "HARDSWISH",
+    "HARDSIGMOID",
 };
 
-static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10");
+static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12");
 
 
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@@ -4007,6 +4012,20 @@ struct ggml_tensor * ggml_silu_back(
     return result;
 }
 
+// ggml hardswish
+struct ggml_tensor * ggml_hardswish(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSWISH);
+}
+
+// ggml hardsigmoid
+struct ggml_tensor * ggml_hardsigmoid(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSIGMOID);
+}
+
 // ggml_norm
 
 static struct ggml_tensor * ggml_norm_impl(
@@ -5408,6 +5427,31 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
     return result;
 }
 
+// ggml_conv_depthwise
+struct ggml_tensor * ggml_conv_depthwise_2d(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        int s0,
+        int s1,
+        int p0,
+        int p1,
+        int d0,
+        int d1) {
+    struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
+    struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
+                                        ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
+                                        s0, s1, p0, p1, d0, d1, true); // [N * IC, OH, OW, KH * KW]
+
+    struct ggml_tensor * result =
+        ggml_mul_mat(ctx,
+                ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1),       // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
+                ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3])); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
+
+    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
+
+    return result;
+}
 // ggml_conv_2d
 
 // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
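The depthwise path reuses im2col plus a batched mul_mat by folding the channel dimension into the batch dimension, so each channel is convolved with its own single-channel kernel. As a quick sanity check of the output-size arithmetic im2col relies on (the standard convolution formula; standalone illustration, not part of the diff):

    #include <stdio.h>

    // output size along one axis: (in + 2*pad - dilation*(kernel - 1) - 1)/stride + 1
    static int conv_out_size(int in, int k, int s, int p, int d) {
        return (in + 2*p - d*(k - 1) - 1)/s + 1;
    }

    int main(void) {
        // example: 32x32 input, 3x3 depthwise kernel, stride 1, padding 1, dilation 1
        const int OH = conv_out_size(32, 3, 1, 1, 1);
        const int OW = conv_out_size(32, 3, 1, 1, 1);
        printf("OH=%d OW=%d\n", OH, OW); // prints 32 32, i.e. a "same" convolution
        return 0;
    }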
@@ -7278,6 +7322,17 @@ static void ggml_compute_forward_add_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
+#ifdef GGML_USE_CLBLAST
+    if (src1->backend == GGML_BACKEND_GPU) {
+        // TODO: OpenCL kernel support full broadcast
+        GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
+        if (ith == 0) {
+            ggml_cl_add(src0, src1, dst);
+        }
+        return;
+    }
+#endif
+
     const int nr = ggml_nrows(src0);
 
     GGML_TENSOR_BINARY_OP_LOCALS
@@ -7558,7 +7613,12 @@ static void ggml_compute_forward_add(
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_add_f32(params, src0, src1, dst);
+                if (src1->type == GGML_TYPE_F32) {
+                    ggml_compute_forward_add_f32(params, src0, src1, dst);
+                }
+                else {
+                    GGML_ASSERT(false);
+                }
             } break;
         case GGML_TYPE_F16:
             {
@@ -7879,6 +7939,9 @@ static void ggml_compute_forward_acc_f32(
     bool inplace = (bool) ((int32_t *) dst->op_params)[4];
 
     if (!inplace && (params->type == GGML_TASK_INIT)) {
+        if (params->ith != 0) {
+            return;
+        }
         // memcpy needs to be synchronized across threads to avoid race conditions.
         // => do it in INIT phase
         memcpy(
@@ -9448,6 +9511,87 @@ static void ggml_compute_forward_silu_back(
     }
 }
 
+
+static void ggml_compute_forward_hardswish_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    assert(params->ith == 0);
+    assert(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert(dst->nb[0]  == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_hardswish_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+static void ggml_compute_forward_hardswish(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_hardswish_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+static void ggml_compute_forward_hardsigmoid_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    assert(params->ith == 0);
+    assert(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert(dst->nb[0]  == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_hardsigmoid_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_hardsigmoid(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_hardsigmoid_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+
 // ggml_compute_forward_norm
 
 static void ggml_compute_forward_norm_f32(
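Both kernels walk the tensor row by row using the byte stride nb[1], which keeps them correct even when rows are padded or otherwise non-contiguous. A minimal standalone illustration of that addressing pattern (plain C, hypothetical 2x4 buffer with a padded row stride):

    #include <stdio.h>

    int main(void) {
        // a 2x4 "tensor" stored with a padded row stride of 6 floats (24 bytes)
        float storage[2*6] = {0};
        const size_t nb1 = 6*sizeof(float); // byte stride between rows, analogous to nb[1]
        const int    nc  = 4;               // elements actually used per row

        for (int i = 0; i < 2; i++) {
            float * row = (float *) ((char *) storage + i*nb1); // same pointer arithmetic as the kernels
            for (int j = 0; j < nc; j++) {
                row[j] = (float) (i*10 + j);
            }
        }

        printf("row 1 starts at storage[%zu] and holds %.0f\n", nb1/sizeof(float), storage[6]);
        return 0;
    }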
@@ -9940,11 +10084,30 @@ static void ggml_compute_forward_mul_mat(
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(dst)) {
-        if (params->ith != 0) {
-            return;
-        }
+        const int64_t ne_plane      = ne01*ne00;
+        const int64_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
+        UNUSED(desired_wsize);
 
         if (params->type == GGML_TASK_INIT) {
+            if (type != GGML_TYPE_F32) {
+                assert(params->wsize >= desired_wsize);
+                // parallelize by src0 rows
+                for (int64_t i13 = 0; i13 < ne13; i13++) {
+                    for (int64_t i12 = 0; i12 < ne12; i12++) {
+                        // broadcast src0 into src1 across 2nd,3rd dimension
+                        const int64_t i03 = i13/r3;
+                        const int64_t i02 = i12/r2;
+
+                        const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
+                        float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
+                        ggml_to_float_t const to_float = type_traits[type].to_float;
+
+                        for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
+                            to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
+                        }
+                    }
+                }
+            }
             return;
         }
 
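Each (i12, i13) pair gets its own dequantized ne00 x ne01 plane in the work buffer, so the offset arithmetic is plain plane-major indexing. A small standalone sketch of that layout with illustrative sizes (not part of the diff):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        // illustrative sizes: one ne00 x ne01 plane per (i12, i13) broadcast position
        const int64_t ne00 = 8, ne01 = 4, ne12 = 3, ne13 = 2;
        const int64_t ne_plane = ne01*ne00;

        // total floats the INIT phase would dequantize into the work buffer
        const int64_t total = ne13*ne12*ne_plane;

        // offset of the plane for a given (i12, i13), matching the kernel's indexing
        const int64_t i12 = 1, i13 = 1;
        const int64_t off = i13*ne12*ne_plane + i12*ne_plane;

        printf("work buffer holds %lld floats; plane (i12=1, i13=1) starts at %lld\n",
               (long long) total, (long long) off);
        return 0;
    }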
@@ -9952,9 +10115,14 @@ static void ggml_compute_forward_mul_mat(
             return;
         }
 
+        // perform sgemm, parallelization controlled by blas lib
+        if (ith != 0) {
+            return;
+        }
+
         //const int64_t tgemm0 = ggml_perf_time_us();
         for (int64_t i13 = 0; i13 < ne13; i13++) {
             for (int64_t i12 = 0; i12 < ne12; i12++) {
                 // broadcast src0 into src1 across 2nd,3rd dimension
                 const int64_t i03 = i13/r3;
                 const int64_t i02 = i12/r2;
@@ -9963,17 +10131,7 @@ static void ggml_compute_forward_mul_mat(
                 float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
                 if (type != GGML_TYPE_F32) {
-                    float * const wdata = params->wdata;
-                    ggml_to_float_t const to_float = type_traits[type].to_float;
-
-                    size_t id = 0;
-                    for (int64_t i01 = 0; i01 < ne01; ++i01) {
-                        to_float((const char *) x + i01*nb01, wdata + id, ne00);
-                        id += ne00;
-                    }
-
-                    assert(id*sizeof(float) <= params->wsize);
-                    x = wdata;
+                    x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
                 }
 
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
@@ -9983,6 +10141,7 @@ static void ggml_compute_forward_mul_mat(
                             0.0f, d, ne01);
             }
         }
+        //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
 
         //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
@@ -9991,6 +10150,9 @@ static void ggml_compute_forward_mul_mat(
 #endif
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         if (src1->type != vec_dot_type) {
             char * wdata = params->wdata;
             const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10155,6 +10317,9 @@ static void ggml_compute_forward_mul_mat_id(
     #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         char * wdata = params->wdata;
         if (src1->type != vec_dot_type) {
             const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10340,6 +10505,9 @@ static void ggml_compute_forward_out_prod_f32(
         return;
     }
 #endif
+        if (ith != 0) {
+            return;
+        }
         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
         return;
     }
@@ -10523,6 +10691,9 @@ static void ggml_compute_forward_out_prod_q_f32(
     // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
         return;
     }
@@ -10707,6 +10878,9 @@ static void ggml_compute_forward_set_f32(
     bool inplace = (bool) ((int32_t *) dst->op_params)[4];
 
     if (!inplace && (params->type == GGML_TASK_INIT)) {
+        if (params->ith != 0) {
+            return;
+        }
         // memcpy needs to be synchronized across threads to avoid race conditions.
         // => do it in INIT phase
         memcpy(
@@ -11031,6 +11205,9 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
     // ggml_compute_forward_dup_same_cont(params, opt0, dst);
 
     if (params->type == GGML_TASK_INIT) {
+        if (params->ith != 0) {
+            return;
+        }
         memset(dst->data, 0, ggml_nbytes(dst));
     }
@@ -11065,6 +11242,9 @@ static void ggml_compute_forward_get_rows_back_f32(
     // ggml_compute_forward_dup_same_cont(params, opt0, dst);
 
     if (params->type == GGML_TASK_INIT) {
+        if (params->ith != 0) {
+            return;
+        }
         memset(dst->data, 0, ggml_nbytes(dst));
     }
@@ -11202,6 +11382,9 @@ static void ggml_compute_forward_diag_mask_f32(
     GGML_ASSERT(n_past >= 0);
 
     if (!inplace && (params->type == GGML_TASK_INIT)) {
+        if (ith != 0) {
+            return;
+        }
         // memcpy needs to be synchronized across threads to avoid race conditions.
         // => do it in INIT phase
         GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
@@ -12172,6 +12355,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         memset(params->wdata, 0, params->wsize);
 
         // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@@ -12266,6 +12452,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         memset(params->wdata, 0, params->wsize);
 
         // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@@ -12464,6 +12653,7 @@ static void ggml_compute_forward_im2col(
     }
 }
 
+
 // ggml_compute_forward_conv_transpose_2d
 
 static void ggml_compute_forward_conv_transpose_2d(
@@ -12489,6 +12679,9 @@ static void ggml_compute_forward_conv_transpose_2d(
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         memset(params->wdata, 0, params->wsize);
 
         // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
@@ -14228,6 +14421,14 @@ static void ggml_compute_forward_unary(
             {
                 ggml_compute_forward_silu(params, src0, dst);
             } break;
+        case GGML_UNARY_OP_HARDSWISH:
+            {
+                ggml_compute_forward_hardswish(params, src0, dst);
+            } break;
+        case GGML_UNARY_OP_HARDSIGMOID:
+            {
+                ggml_compute_forward_hardsigmoid(params, src0, dst);
+            } break;
         default:
             {
                 GGML_ASSERT(false);
@@ -14291,6 +14492,9 @@ static void ggml_compute_forward_add_rel_pos_f32(
 
     const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
     if (!inplace && params->type == GGML_TASK_INIT) {
+        if (params->ith != 0) {
+            return;
+        }
         memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
         return;
     }
@@ -16589,8 +16793,9 @@ struct ggml_compute_state_shared {
     const int n_threads;
 
     // synchronization primitives
-    atomic_int n_active; // num active threads
-    atomic_int node_n;   // active graph node
+    atomic_int n_active;  // num active threads
+    atomic_int node_n;    // active graph node
+    atomic_int node_task; // active graph node task phase
 
     bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
     void * abort_callback_data;
@@ -16646,6 +16851,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_UNARY_OP_TANH:
         case GGML_UNARY_OP_ELU:
         case GGML_UNARY_OP_RELU:
+        case GGML_UNARY_OP_HARDSWISH:   // to opt for multiple threads
+        case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
             {
                 n_tasks = 1;
             } break;
@@ -16722,7 +16929,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_SOFT_MAX:
             {
-                n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
+                n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
             } break;
         case GGML_OP_CONV_TRANSPOSE_1D:
             {
@@ -16837,6 +17044,34 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
     return n_tasks;
 }
 
+static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
+    // wait for other threads to finish
+    const int last_node_n = * node_n;
+
+    while (true) {
+        if (do_yield) {
+            sched_yield();
+        }
+
+        * node_n = atomic_load(&state->shared->node_n);
+        if (* node_n != last_node_n) break;
+    }
+}
+
+static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
+    // wait for other threads to finish
+    const int last_task_phase = * task_phase;
+
+    while (true) {
+        if (do_yield) {
+            sched_yield();
+        }
+
+        * task_phase = atomic_load(&state->shared->node_task);
+        if (* task_phase != last_task_phase) break;
+    }
+}
+
 static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
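Both helpers are the same primitive: remember the value you last saw, then spin (optionally yielding the CPU) until another thread publishes a different one. A tiny standalone sketch of that publish/spin pattern with C11 atomics and pthreads (illustrative only, not the ggml scheduler; compile with -pthread):

    #include <pthread.h>
    #include <sched.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int phase = 0;

    static void * waiter(void * arg) {
        const int last = *(const int *) arg;  // the phase this thread last completed
        while (atomic_load(&phase) == last) { // spin until someone publishes a new phase
            sched_yield();                    // be polite while spinning
        }
        printf("waiter observed phase %d\n", atomic_load(&phase));
        return NULL;
    }

    int main(void) {
        int last = 0;
        pthread_t t;
        pthread_create(&t, NULL, waiter, &last);
        atomic_store(&phase, 1);              // the "last thread" publishes the next phase
        pthread_join(t, NULL);
        return 0;
    }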
@@ -16847,7 +17082,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
     set_numa_thread_affinity(state->ith, n_threads);
 
-    int node_n = -1;
+    int node_n     = -1;
+    int task_phase = GGML_TASK_FINALIZE;
 
     while (true) {
         if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
@@ -16879,7 +17115,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         // distribute new work or execute it direct if 1T
         while (++node_n < cgraph->n_nodes) {
             GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
-
             struct ggml_tensor * node = cgraph->nodes[node_n];
             const int n_tasks = ggml_get_n_tasks(node, n_threads);
@@ -16888,13 +17123,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
             params.nth = n_tasks;
 
-            /* INIT */
-            if (GGML_OP_HAS_INIT[node->op]) {
-                params.type = GGML_TASK_INIT;
-                ggml_compute_forward(&params, node);
-            }
-
             if (n_tasks == 1) {
+                /* INIT */
+                if (GGML_OP_HAS_INIT[node->op]) {
+                    params.type = GGML_TASK_INIT;
+                    ggml_compute_forward(&params, node);
+                }
+
                 // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
                 // they do something more efficient than spinning (?)
                 params.type = GGML_TASK_COMPUTE;
@@ -16915,38 +17150,24 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 }
             }
 
-            atomic_store(&state->shared->n_active, n_threads);
-            atomic_store(&state->shared->node_n,   node_n);
+            task_phase = GGML_TASK_INIT;
+            atomic_store(&state->shared->n_active,  n_threads);
+            atomic_store(&state->shared->node_n,    node_n);
+            atomic_store(&state->shared->node_task, task_phase);
         } else {
-            // wait for other threads to finish
-            const int last = node_n;
-
-            const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
-
-            while (true) {
-                // TODO: this sched_yield can have significant impact on the performance - either positive or negative
-                //       depending on the workload and the operating system.
-                //       since it is not clear what is the best approach, it should potentially become user-configurable
-                //       ref: https://github.com/ggerganov/ggml/issues/291
-                // UPD:  adding the do_yield flag seems to resolve the issue universally
-                if (do_yield) {
-                    sched_yield();
-                }
-
-                node_n = atomic_load(&state->shared->node_n);
-                if (node_n != last) break;
-            };
+            ggml_graph_compute_thread_sync_node(&node_n,     state, false);
+            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
         }
 
         // check if we should stop
         if (node_n >= cgraph->n_nodes) break;
 
-        /* COMPUTE */
+        /* INIT & COMPUTE */
         struct ggml_tensor * node = cgraph->nodes[node_n];
         const int n_tasks = ggml_get_n_tasks(node, n_threads);
 
         struct ggml_compute_params params = {
-            /*.type  =*/ GGML_TASK_COMPUTE,
+            /*.type  =*/ GGML_TASK_INIT,
             /*.ith   =*/ state->ith,
             /*.nth   =*/ n_tasks,
             /*.wsize =*/ cplan->work_size,
@@ -16954,8 +17175,39 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         };
 
+        if (state->ith < n_tasks) {
+            if (GGML_OP_HAS_INIT[node->op]) {
+                ggml_compute_forward(&params, node);
+            }
+        }
+
+        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
+            task_phase = GGML_TASK_COMPUTE;
+            atomic_store(&state->shared->n_active,  n_threads);
+            atomic_store(&state->shared->node_task, task_phase);
+        }
+        else {
+            // TODO: this sched_yield can have significant impact on the performance - either positive or negative
+            //       depending on the workload and the operating system.
+            //       since it is not clear what is the best approach, it should potentially become user-configurable
+            //       ref: https://github.com/ggerganov/ggml/issues/291
+            // UPD:  adding the do_yield flag seems to resolve the issue universally
+            const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
+            ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
+        }
+
         if (state->ith < n_tasks) {
+            params.type = GGML_TASK_COMPUTE;
             ggml_compute_forward(&params, node);
         }
+
+        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
+            task_phase = GGML_TASK_FINALIZE;
+            atomic_store(&state->shared->n_active,  n_threads);
+            atomic_store(&state->shared->node_task, task_phase);
+        }
+        else {
+            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
+        }
     }
 
     return GGML_EXIT_SUCCESS;
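The phase hand-off is effectively a counting barrier: every thread decrements n_active, and the thread that brings it to zero (fetch_sub returning 1) resets the counter and publishes the next phase for the others to spin on. A small hedged sketch of that pattern with C11 atomics and pthreads (illustrative, not the ggml code; compile with -pthread):

    #include <pthread.h>
    #include <sched.h>
    #include <stdatomic.h>
    #include <stdio.h>

    #define N_THREADS 4

    static atomic_int n_active = N_THREADS;
    static atomic_int phase    = 0;

    static void * worker(void * arg) {
        const int ith = (int) (long) arg;

        // ... per-thread work for phase 0 would happen here ...

        if (atomic_fetch_sub(&n_active, 1) == 1) {
            // last thread through: reset the counter and publish the next phase
            atomic_store(&n_active, N_THREADS);
            atomic_store(&phase, 1);
        } else {
            // everyone else spins until the phase changes
            while (atomic_load(&phase) == 0) {
                sched_yield();
            }
        }

        printf("thread %d entered phase %d\n", ith, atomic_load(&phase));
        return NULL;
    }

    int main(void) {
        pthread_t t[N_THREADS];
        for (long i = 0; i < N_THREADS; ++i) pthread_create(&t[i], NULL, worker, (void *) i);
        for (int  i = 0; i < N_THREADS; ++i) pthread_join(t[i], NULL);
        return 0;
    }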
@@ -17012,8 +17264,11 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                 if (ggml_compute_forward_mul_mat_use_blas(node)) {
                     if (node->src[0]->type != GGML_TYPE_F32) {
-                        // here we need memory just for single 2D matrix from src0
-                        cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
+                        // here we need memory for fully dequantized matrix from src0
+                        // take into account that src0 can be broadcasted into src1[2,3]
+                        cur = ggml_type_size(GGML_TYPE_F32)
+                            * node->src[0]->ne[0]*node->src[0]->ne[1]
+                            * node->src[1]->ne[2]*node->src[1]->ne[3];
                     }
                 } else
 #endif
@@ -17173,6 +17428,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         /*.n_threads           =*/ n_threads,
         /*.n_active            =*/ n_threads,
         /*.node_n              =*/ -1,
+        /*.node_task           =*/ GGML_TASK_FINALIZE,
         /*.abort_callback      =*/ NULL,
         /*.abort_callback_data =*/ NULL,
     };