diff --git a/ggml.c b/ggml.c index 7fe1576b3..3c2792f8d 100644 --- a/ggml.c +++ b/ggml.c @@ -195,8 +195,8 @@ typedef void * thread_ret_t; #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN) #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) #else -inline static void* ggml_aligned_malloc(size_t size) { - void* aligned_memory = NULL; +inline static void * ggml_aligned_malloc(size_t size) { + void * aligned_memory = NULL; #ifdef GGML_USE_METAL int result = posix_memalign(&aligned_memory, getpagesize(), size); #else @@ -3812,7 +3812,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59"); +static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3884,7 +3884,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59"); +static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4111,7 +4111,7 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) { // // is enough, but just in case, adding the second part - return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]); + return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN); } size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) { @@ -4254,7 +4254,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) { tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } -static inline bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { +bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return @@ -4603,7 +4603,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( /*.ne =*/ { 1, 1, 1, 1 }, /*.nb =*/ { 0, 0, 0, 0 }, /*.op =*/ GGML_OP_NONE, - /*.op_params =*/ {0}, + /*.op_params =*/ { 0 }, /*.is_param =*/ false, /*.grad =*/ NULL, /*.src =*/ { NULL }, @@ -4635,6 +4635,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( } static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) { + GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings assert(params_size <= GGML_MAX_OP_PARAMS); memcpy(tensor->op_params, params, params_size); } @@ -6440,7 +6441,7 @@ struct ggml_tensor * ggml_permute( result->src[0] = a; int32_t params[] = { axis0, axis1, axis2, axis3 }; - ggml_set_op_params(result, &params, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); return result; } @@ -6566,7 +6567,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); int32_t params[] = { n_past, inplace ? 1 : 0 }; - ggml_set_op_params(result, &params, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_DIAG_MASK_INF; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6606,7 +6607,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); int32_t params[] = { n_past, inplace ?
1 : 0 }; - ggml_set_op_params(result, &params, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_DIAG_MASK_ZERO; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6722,9 +6723,9 @@ static struct ggml_tensor * ggml_rope_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); int32_t params[6] = { n_past, n_dims, mode, n_ctx }; - memcpy(params + 4, &freq_base, sizeof(float)); + memcpy(params + 4, &freq_base, sizeof(float)); memcpy(params + 5, &freq_scale, sizeof(float)); - ggml_set_op_params(result, &params, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ROPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6798,7 -6799,7 @@ struct ggml_tensor * ggml_rope_back( struct ggml_tensor * result = ggml_dup_tensor(ctx, a); int32_t params[] = { n_past, n_dims, mode, n_ctx }; - ggml_set_op_params(result, &params, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ROPE_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6829,7 +6830,7 @@ struct ggml_tensor * ggml_alibi( int32_t op_params[3] = { n_past, n_head }; memcpy(op_params + 2, &bias_max, sizeof(float)); - ggml_set_op_params(result, &op_params, sizeof(op_params)); + ggml_set_op_params(result, op_params, sizeof(op_params)); result->op = GGML_OP_ALIBI; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6856,7 +6857,7 @@ struct ggml_tensor * ggml_clamp( struct ggml_tensor * result = ggml_view_tensor(ctx, a); float params[] = { min, max }; - ggml_set_op_params(result, &params, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_CLAMP; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6891,10 +6892,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d( ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), a->ne[2], 1, 1, }; - struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); int32_t params[] = { s0, p0, d0 }; - ggml_set_op_params(result, &params, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_CONV_1D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6906,10 +6907,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d( // ggml_conv_2d -struct ggml_tensor* ggml_conv_2d( - struct ggml_context* ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, +struct ggml_tensor * ggml_conv_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, int s0, int s1, int p0, @@ -6930,10 +6931,10 @@ struct ggml_tensor* ggml_conv_2d( ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1), a->ne[3], b->ne[3], }; - struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); int32_t params[] = { s0, s1, p0, p1, d0, d1 }; - ggml_set_op_params(result, &params, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_CONV_2D; result->grad = is_node ?
ggml_dup_tensor(ctx, result) : NULL; @@ -6946,7 +6947,7 @@ struct ggml_tensor* ggml_conv_2d( // ggml_conv_1d_ph -struct ggml_tensor* ggml_conv_1d_ph( +struct ggml_tensor * ggml_conv_1d_ph( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, @@ -6964,7 +6965,7 @@ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) { // ggml_pool_1d -struct ggml_tensor* ggml_pool_1d( +struct ggml_tensor * ggml_pool_1d( struct ggml_context * ctx, struct ggml_tensor * a, enum ggml_op_pool op, @@ -6983,10 +6984,10 @@ struct ggml_tensor* ggml_pool_1d( ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), a->ne[1], }; - struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); int32_t params[] = { op, k0, s0, p0 }; - ggml_set_op_params(result, &params, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_POOL_1D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6997,7 +6998,7 @@ struct ggml_tensor* ggml_pool_1d( // ggml_pool_2d -struct ggml_tensor* ggml_pool_2d( +struct ggml_tensor * ggml_pool_2d( struct ggml_context * ctx, struct ggml_tensor * a, enum ggml_op_pool op, @@ -7020,10 +7021,10 @@ struct ggml_tensor* ggml_pool_2d( ggml_calc_pool_output_size(a->ne[1], k1, s1, p1), a->ne[2], }; - struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); int32_t params[] = { op, k0, k1, s0, s1, p0, p1 }; - ggml_set_op_params(result, &params, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_POOL_2D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -7191,7 +7192,7 @@ struct ggml_tensor * ggml_win_part( struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); int32_t params[] = { npx, npy, w }; - ggml_set_op_params(result, &params, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_WIN_PART; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -7221,7 +7222,7 @@ struct ggml_tensor * ggml_win_unpart( struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); int32_t params[] = { w }; - ggml_set_op_params(result, &params, sizeof(params)); + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_WIN_UNPART; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -7350,7 +7351,7 @@ struct ggml_tensor * ggml_map_binary_inplace_f32( return ggml_map_binary_impl_f32(ctx, a, b, fun, true); } -// ggml_map_custom1 +// ggml_map_custom1_f32 static struct ggml_tensor * ggml_map_custom1_impl_f32( struct ggml_context * ctx, @@ -7367,7 +7368,7 @@ static struct ggml_tensor * ggml_map_custom1_impl_f32( ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); - result->op = GGML_OP_MAP_CUSTOM1; + result->op = GGML_OP_MAP_CUSTOM1_F32; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; @@ -7388,7 +7389,7 @@ struct ggml_tensor * ggml_map_custom1_inplace_f32( return ggml_map_custom1_impl_f32(ctx, a, fun, true); } -// ggml_map_custom2 +// ggml_map_custom2_f32 static struct ggml_tensor * ggml_map_custom2_impl_f32( struct ggml_context * ctx, @@ -7406,7 +7407,7 @@ static struct ggml_tensor * ggml_map_custom2_impl_f32( ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); - result->op = GGML_OP_MAP_CUSTOM2; + result->op = GGML_OP_MAP_CUSTOM2_F32; result->grad = is_node ?
ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; @@ -7430,7 +7431,7 @@ struct ggml_tensor * ggml_map_custom2_inplace_f32( return ggml_map_custom2_impl_f32(ctx, a, b, fun, true); } -// ggml_map_custom3 +// ggml_map_custom3_f32 static struct ggml_tensor * ggml_map_custom3_impl_f32( struct ggml_context * ctx, @@ -7449,7 +7450,7 @@ static struct ggml_tensor * ggml_map_custom3_impl_f32( ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); - result->op = GGML_OP_MAP_CUSTOM3; + result->op = GGML_OP_MAP_CUSTOM3_F32; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; @@ -7476,6 +7477,190 @@ struct ggml_tensor * ggml_map_custom3_inplace_f32( return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true); } +// ggml_map_custom1 +struct ggml_map_custom1_op_params { + ggml_custom1_op_t fun; + int n_tasks; + void * userdata; +}; + +static struct ggml_tensor * ggml_map_custom1_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { + GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); + + bool is_node = false; + + if (!inplace && a->grad) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + struct ggml_map_custom1_op_params params = { + /*.fun =*/ fun, + /*.n_tasks =*/ n_tasks, + /*.userdata =*/ userdata + }; + ggml_set_op_params(result, (const void *) &params, sizeof(params)); + + result->op = GGML_OP_MAP_CUSTOM1; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_map_custom1( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false); +} + +struct ggml_tensor * ggml_map_custom1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true); +} + +// ggml_map_custom2 + +struct ggml_map_custom2_op_params { + ggml_custom2_op_t fun; + int n_tasks; + void * userdata; +}; + +static struct ggml_tensor * ggml_map_custom2_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { + GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + struct ggml_map_custom2_op_params params = { + /*.fun =*/ fun, + /*.n_tasks =*/ n_tasks, + /*.userdata =*/ userdata + }; + ggml_set_op_params(result, (const void *) &params, sizeof(params)); + + result->op = GGML_OP_MAP_CUSTOM2; + result->grad = is_node ?
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_map_custom2( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false); +} + +struct ggml_tensor * ggml_map_custom2_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true); +} + +// ggml_map_custom3 + +struct ggml_map_custom3_op_params { + ggml_custom3_op_t fun; + int n_tasks; + void * userdata; +}; + +static struct ggml_tensor * ggml_map_custom3_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { + GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); + + bool is_node = false; + + if (!inplace && (a->grad || b->grad || c->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + struct ggml_map_custom3_op_params params = { + /*.fun =*/ fun, + /*.n_tasks =*/ n_tasks, + /*.userdata =*/ userdata + }; + ggml_set_op_params(result, (const void *) &params, sizeof(params)); + + result->op = GGML_OP_MAP_CUSTOM3; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; + + return result; +} + +struct ggml_tensor * ggml_map_custom3( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false); +} + +struct ggml_tensor * ggml_map_custom3_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true); +} + + + // ggml_cross_entropy_loss struct ggml_tensor * ggml_cross_entropy_loss( @@ -9284,8 -9469,8 @@ static void ggml_compute_forward_sum_rows_f32( for (int64_t i3 = 0; i3 < ne03; i3++) { for (int64_t i2 = 0; i2 < ne02; i2++) { for (int64_t i1 = 0; i1 < ne01; i1++) { - float* src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03); - float* dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3); + float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03); + float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3); float row_sum = 0; ggml_vec_sum_f32(ne00, &row_sum, src_row); dst_row[0] = row_sum; @@ -10542,72 +10727,96 @@ static void ggml_compute_forward_mul_mat( return; } - // parallelize by src0 rows - const int64_t dr = (ne01 + nth - 1)/nth; - - const int64_t ir10 = dr*ith; - const int64_t ir11 = MIN(ir10 + dr, ne01); - - // src1 rows - const int64_t nr1 = ne11*ne12*ne13; - const void * wdata = (src1->type == vec_dot_type) ?
src1->data : params->wdata; const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; - for (int64_t ir1 = 0; ir1 < nr1; ++ir1) { - const int64_t i13 = (ir1/(ne12*ne11)); - const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11; - const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11); + const int64_t nr0 = ne01; // src0 rows + const int64_t nr1 = ne11*ne12*ne13; // src1 rows - const int64_t ir0 = (ir1/ne11)%(ne02*ne03); - const int64_t i03 = (ir0/(ne02)); - // Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2. - // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470: - // GG: this is likely the correct way to broadcast, though need some more thought - // therefore leaving the comments to remind us for now - const int64_t i02 = (i12 / (ne12 / ne02)); - // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon) - // const int64_t i02 = (ir0 - i03*ne02); + //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1); - const int64_t i1 = i11; - const int64_t i2 = i12; - const int64_t i3 = i13; + // distribute the thread work across the inner or outer loop based on which one is larger - const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 ); + const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows + const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows - // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides - // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using - // the original src1 data pointer, so we should index using the indices directly - // TODO: this is a bit of a hack, we should probably have a better way to handle this - const char * src1_col = (const char *) wdata + - (src1_cont || src1->type != vec_dot_type - ? 
(i11 + i12*ne11 + i13*ne12*ne11)*row_size - : (i11*nb11 + i12*nb12 + i13*nb13)); + const int64_t ith0 = ith % nth0; + const int64_t ith1 = ith / nth0; - float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); + const int64_t dr0 = (nr0 + nth0 - 1)/nth0; + const int64_t dr1 = (nr1 + nth1 - 1)/nth1; - for (int64_t ir = ir10; ir < ir11; ++ir) { - vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col); - } + const int64_t ir010 = dr0*ith0; + const int64_t ir011 = MIN(ir010 + dr0, nr0); + + const int64_t ir110 = dr1*ith1; + const int64_t ir111 = MIN(ir110 + dr1, nr1); + + //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111); + + // threads with no work simply yield (not sure if it helps) + if (ir010 >= ir011 || ir110 >= ir111) { + sched_yield(); + return; } - //int64_t t1 = ggml_time_us(); - //static int64_t acc = 0; - //acc += t1 - t0; - //if (t1 - t0 > 10) { - // printf("\n"); - // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); - // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); - // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); + assert(ne12 % ne02 == 0); + assert(ne13 % ne03 == 0); - // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); - //} + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + // block-tiling attempt + const int64_t blck_0 = 16; + const int64_t blck_1 = 16; + + // attempt to reduce false-sharing (does not seem to make a difference) + float tmp[16]; + + for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { + for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { + for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { + const int64_t i13 = (ir1/(ne12*ne11)); + const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11; + const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11); + + // broadcast src0 into src1 + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; + + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; + + const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03); + + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char * src1_col = (const char *) wdata + + (src1_cont || src1->type != vec_dot_type + ? 
(i11 + i12*ne11 + i13*ne12*ne11)*row_size + : (i11*nb11 + i12*nb12 + i13*nb13)); + + float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); + + //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); + //} + + for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col); + } + memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); + } + } + } } - // ggml_compute_forward_out_prod - static void ggml_compute_forward_out_prod_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -12890,7 +13099,7 @@ static void ggml_compute_forward_pool_1d( const struct ggml_tensor * src0, struct ggml_tensor * dst) { - const int32_t* opts = (const int32_t*)dst->op_params; + const int32_t * opts = (const int32_t *)dst->op_params; enum ggml_op_pool op = opts[0]; const int k0 = opts[1]; const int s0 = opts[2]; @@ -14223,24 +14432,6 @@ static void ggml_compute_forward_map_custom1_f32( fun(dst, a); } - -static void ggml_compute_forward_map_custom1( - const struct ggml_compute_params * params, - const struct ggml_tensor * a, - struct ggml_tensor * dst, - const ggml_custom1_op_f32_t fun) { - switch (a->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_map_custom1_f32(params, a, dst, fun); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - // ggml_compute_forward_map_custom2 static void ggml_compute_forward_map_custom2_f32( @@ -14259,24 +14450,6 @@ static void ggml_compute_forward_map_custom2_f32( } -static void ggml_compute_forward_map_custom2( - const struct ggml_compute_params * params, - const struct ggml_tensor * a, - const struct ggml_tensor * b, - struct ggml_tensor * dst, - const ggml_custom2_op_f32_t fun) { - switch (a->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - // ggml_compute_forward_map_custom3 static void ggml_compute_forward_map_custom3_f32( @@ -14295,24 +14468,52 @@ static void ggml_compute_forward_map_custom3_f32( fun(dst, a, b, c); } +// ggml_compute_forward_map_custom1 + +static void ggml_compute_forward_map_custom1( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) dst->op_params; + + p->fun(dst, a, params->ith, params->nth, p->userdata); +} + +// ggml_compute_forward_map_custom2 + +static void ggml_compute_forward_map_custom2( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + const struct ggml_tensor * b, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) dst->op_params; + + p->fun(dst, a, b, params->ith, params->nth, p->userdata); +} + +// ggml_compute_forward_map_custom3 static void ggml_compute_forward_map_custom3( const struct ggml_compute_params * params, const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, - struct ggml_tensor * dst, - const ggml_custom3_op_f32_t fun) { - switch (a->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, 
fun); - } break; - default: - { - GGML_ASSERT(false); - } break; + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; } + + struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) dst->op_params; + + p->fun(dst, a, b, c, params->ith, params->nth, p->userdata); } // ggml_compute_forward_cross_entropy_loss @@ -14834,25 +15035,40 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun); } break; - case GGML_OP_MAP_CUSTOM1: + case GGML_OP_MAP_CUSTOM1_F32: { ggml_custom1_op_f32_t fun; memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun); + ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun); + } + break; + case GGML_OP_MAP_CUSTOM2_F32: + { + ggml_custom2_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun); + } + break; + case GGML_OP_MAP_CUSTOM3_F32: + { + ggml_custom3_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun); + } + break; + case GGML_OP_MAP_CUSTOM1: + { + ggml_compute_forward_map_custom1(params, tensor->src[0], tensor); } break; case GGML_OP_MAP_CUSTOM2: { - ggml_custom2_op_f32_t fun; - memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun); + ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_MAP_CUSTOM3: { - ggml_custom3_op_f32_t fun; - memcpy(&fun, tensor->op_params, sizeof(fun)); - ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun); + ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; case GGML_OP_CROSS_ENTROPY_LOSS: @@ -15660,6 +15876,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1_F32: + case GGML_OP_MAP_CUSTOM2_F32: + case GGML_OP_MAP_CUSTOM3_F32: case GGML_OP_MAP_CUSTOM1: case GGML_OP_MAP_CUSTOM2: case GGML_OP_MAP_CUSTOM3: @@ -16449,12 +16668,39 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { case GGML_OP_WIN_UNPART: case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: - case GGML_OP_MAP_CUSTOM1: - case GGML_OP_MAP_CUSTOM2: - case GGML_OP_MAP_CUSTOM3: + case GGML_OP_MAP_CUSTOM1_F32: + case GGML_OP_MAP_CUSTOM2_F32: + case GGML_OP_MAP_CUSTOM3_F32: { n_tasks = 1; } break; + case GGML_OP_MAP_CUSTOM1: + { + struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params; + if (p->n_tasks == GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } + } break; + case GGML_OP_MAP_CUSTOM2: + { + struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params; + if (p->n_tasks == GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } + } break; + case GGML_OP_MAP_CUSTOM3: + { + struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params; + if (p->n_tasks == GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = 
MIN(p->n_tasks, n_threads); + } + } break; case GGML_OP_CROSS_ENTROPY_LOSS: { n_tasks = n_threads; diff --git a/ggml.h b/ggml.h index aba92480c..bdbd12800 100644 --- a/ggml.h +++ b/ggml.h @@ -183,6 +183,15 @@ # define GGML_API #endif +// TODO: support for clang +#ifdef __GNUC__ +# define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint))) +#elif defined(_MSC_VER) +# define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func +#else +# define GGML_DEPRECATED(func, hint) func +#endif + #include #include #include @@ -374,6 +383,10 @@ extern "C" { GGML_OP_MAP_UNARY, GGML_OP_MAP_BINARY, + GGML_OP_MAP_CUSTOM1_F32, + GGML_OP_MAP_CUSTOM2_F32, + GGML_OP_MAP_CUSTOM3_F32, + GGML_OP_MAP_CUSTOM1, GGML_OP_MAP_CUSTOM2, GGML_OP_MAP_CUSTOM3, @@ -570,6 +583,8 @@ extern "C" { GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor); GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor); + GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1); + // use this to compute the memory overhead of a tensor GGML_API size_t ggml_tensor_overhead(void); @@ -1240,7 +1255,7 @@ extern "C" { // conv_1d with padding = half // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d) - GGML_API struct ggml_tensor* ggml_conv_1d_ph( + GGML_API struct ggml_tensor * ggml_conv_1d_ph( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, @@ -1253,7 +1268,7 @@ extern "C" { GGML_OP_POOL_COUNT, }; - GGML_API struct ggml_tensor* ggml_pool_1d( + GGML_API struct ggml_tensor * ggml_pool_1d( struct ggml_context * ctx, struct ggml_tensor * a, enum ggml_op_pool op, @@ -1261,7 +1276,7 @@ extern "C" { int s0, // stride int p0); // padding - GGML_API struct ggml_tensor* ggml_pool_2d( + GGML_API struct ggml_tensor * ggml_pool_2d( struct ggml_context * ctx, struct ggml_tensor * a, enum ggml_op_pool op, @@ -1315,15 +1330,6 @@ extern "C" { int h0, int w); - // custom operators - - typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *); - typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *); - - typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *); - typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); - typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); - GGML_API struct ggml_tensor * ggml_unary( struct ggml_context * ctx, struct ggml_tensor * a, @@ -1334,63 +1340,138 @@ extern "C" { struct ggml_tensor * a, enum ggml_unary_op op); - GGML_API struct ggml_tensor * ggml_map_unary_f32( + // custom operators + + typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *); + typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *); + + typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *); + typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); + typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32( struct ggml_context * ctx, struct ggml_tensor * a, - ggml_unary_op_f32_t fun); + ggml_unary_op_f32_t fun), + "use ggml_map_custom1 instead"); - GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32( + 
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32( struct ggml_context * ctx, struct ggml_tensor * a, - ggml_unary_op_f32_t fun); + ggml_unary_op_f32_t fun), + "use ggml_map_custom1_inplace instead"); - GGML_API struct ggml_tensor * ggml_map_binary_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - ggml_binary_op_f32_t fun); + ggml_binary_op_f32_t fun), + "use ggml_map_custom2 instead"); - GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - ggml_binary_op_f32_t fun); + ggml_binary_op_f32_t fun), + "use ggml_map_custom2_inplace instead"); - GGML_API struct ggml_tensor * ggml_map_custom1_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32( struct ggml_context * ctx, struct ggml_tensor * a, - ggml_custom1_op_f32_t fun); + ggml_custom1_op_f32_t fun), + "use ggml_map_custom1 instead"); - GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32( struct ggml_context * ctx, struct ggml_tensor * a, - ggml_custom1_op_f32_t fun); + ggml_custom1_op_f32_t fun), + "use ggml_map_custom1_inplace instead"); - GGML_API struct ggml_tensor * ggml_map_custom2_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - ggml_custom2_op_f32_t fun); + ggml_custom2_op_f32_t fun), + "use ggml_map_custom2 instead"); - GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - ggml_custom2_op_f32_t fun); + ggml_custom2_op_f32_t fun), + "use ggml_map_custom2_inplace instead"); - GGML_API struct ggml_tensor * ggml_map_custom3_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_tensor * c, - ggml_custom3_op_f32_t fun); + ggml_custom3_op_f32_t fun), + "use ggml_map_custom3 instead"); - GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32( + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_tensor * c, - ggml_custom3_op_f32_t fun); + ggml_custom3_op_f32_t fun), + "use ggml_map_custom3_inplace instead"); + + // custom operators v2 + + typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata); + typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata); + typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata); + + #define GGML_N_TASKS_MAX -1 + + GGML_API struct ggml_tensor * ggml_map_custom1( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_t fun, + int n_tasks, + void * userdata); + + 
GGML_API struct ggml_tensor * ggml_map_custom2( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom2_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom3( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom3_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_t fun, + int n_tasks, + void * userdata); // loss function
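
For illustration, here is one way the new v2 custom-operator API can be called. Everything named `scale_rows`, `make_scaled`, and `g_scale` below is hypothetical; `ggml_map_custom1`, `ggml_custom1_op_t`, and `GGML_N_TASKS_MAX` come from this change, while `ggml_nrows` and the public `ne`/`nb`/`data` tensor fields are assumed from the existing ggml API. The sketch assumes a contiguous F32 tensor. The callback receives its thread index `ith` and the thread count `nth`, so it partitions its own work:

```c
#include "ggml.h"

// hypothetical user callback: dst = scale * a, with rows split across threads
static void scale_rows(struct ggml_tensor * dst, const struct ggml_tensor * a,
                       int ith, int nth, void * userdata) {
    const float scale = *(const float *) userdata;

    const int64_t nr  = ggml_nrows(a);       // total rows to process
    const int64_t dr  = (nr + nth - 1)/nth;  // rows per thread (rounded up)
    const int64_t ir0 = dr*ith;              // this thread's first row
    const int64_t ir1 = ir0 + dr < nr ? ir0 + dr : nr;

    for (int64_t ir = ir0; ir < ir1; ++ir) {
        const float * src_row = (const float *) ((const char *) a->data   + ir*a->nb[1]);
        float       * dst_row = (float       *) ((      char *) dst->data + ir*dst->nb[1]);
        for (int64_t i0 = 0; i0 < a->ne[0]; ++i0) {
            dst_row[i0] = scale*src_row[i0];
        }
    }
}

// only the userdata pointer is stored in op_params, so the pointee must
// outlive graph computation
static float g_scale = 2.0f;

struct ggml_tensor * make_scaled(struct ggml_context * ctx, struct ggml_tensor * x) {
    // GGML_N_TASKS_MAX hands the op to every available thread; pass 1 to
    // force single-threaded execution
    return ggml_map_custom1(ctx, x, scale_rows, GGML_N_TASKS_MAX, &g_scale);
}
```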
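Since the old f32 map entry points are now wrapped in GGML_DEPRECATED, existing callers will start emitting warnings. A possible migration shim (names hypothetical) keeps an old `ggml_custom1_op_f32_t` callback working against the new interface:

```c
// existing callback written against the old, thread-unaware interface
static void my_op_f32(struct ggml_tensor * dst, const struct ggml_tensor * a);

// thin adapter: ignore ith/nth/userdata and run the old body unchanged
static void my_op_adapter(struct ggml_tensor * dst, const struct ggml_tensor * a,
                          int ith, int nth, void * userdata) {
    (void) ith; (void) nth; (void) userdata;
    my_op_f32(dst, a);
}

// before: t = ggml_map_custom1_f32(ctx, a, my_op_f32);                // now deprecated
// after:  t = ggml_map_custom1    (ctx, a, my_op_adapter, 1, NULL);   // n_tasks = 1 keeps it single-threaded
```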
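The reworked `ggml_compute_forward_mul_mat` no longer always parallelizes over src0 rows; the threads go to whichever of `nr0 = ne01` (src0 rows) and `nr1 = ne11*ne12*ne13` (src1 rows) is larger. The standalone sketch below is not part of the patch; it restates just the partitioning arithmetic from that hunk with made-up sizes so the per-thread ranges can be checked by eye:

```c
#include <stdio.h>
#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
    const int64_t nr0 = 32; // example: src0 rows (ne01)
    const int64_t nr1 = 3;  // example: src1 rows (ne11*ne12*ne13)
    const int     nth = 4;  // example: thread count

    // all threads go to the larger dimension; the other dimension gets a single slot
    const int64_t nth0 = nr0 > nr1 ? nth : 1;
    const int64_t nth1 = nr0 > nr1 ? 1 : nth;

    for (int ith = 0; ith < nth; ++ith) {
        const int64_t ith0 = ith % nth0;
        const int64_t ith1 = ith / nth0;

        const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
        const int64_t dr1 = (nr1 + nth1 - 1)/nth1;

        const int64_t ir010 = dr0*ith0;
        const int64_t ir011 = MIN(ir010 + dr0, nr0);
        const int64_t ir110 = dr1*ith1;
        const int64_t ir111 = MIN(ir110 + dr1, nr1);

        printf("thread %d: src0 rows [%lld, %lld), src1 rows [%lld, %lld)\n",
               ith, (long long) ir010, (long long) ir011,
                    (long long) ir110, (long long) ir111);
    }

    return 0;
}
```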