Merge branch 'master' into gg/flash-attn
This commit is contained in:
commit
6875997fd6
84 changed files with 7764 additions and 14303 deletions
223
ggml.c
223
ggml.c
|
@ -428,8 +428,8 @@ int64_t ggml_cycles_per_ms(void) {
|
|||
|
||||
static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
|
||||
|
||||
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
|
||||
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
|
||||
static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
|
||||
static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
|
||||
|
||||
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
[GGML_TYPE_I8] = {
|
||||
|
@ -457,6 +457,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.is_quantized = false,
|
||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
|
||||
.vec_dot_type = GGML_TYPE_F32,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_F16] = {
|
||||
.type_name = "f16",
|
||||
|
@ -468,6 +469,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
|
||||
.vec_dot_type = GGML_TYPE_F16,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q4_0] = {
|
||||
.type_name = "q4_0",
|
||||
|
@ -479,6 +481,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
|
||||
.vec_dot = ggml_vec_dot_q4_0_q8_0,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
||||
.nrows = 2,
|
||||
#else
|
||||
.nrows = 1,
|
||||
#endif
|
||||
},
|
||||
[GGML_TYPE_Q4_1] = {
|
||||
.type_name = "q4_1",
|
||||
|
@ -490,6 +497,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
|
||||
.vec_dot = ggml_vec_dot_q4_1_q8_1,
|
||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
||||
.nrows = 2,
|
||||
#else
|
||||
.nrows = 1,
|
||||
#endif
|
||||
},
|
||||
[4] = { // GGML_TYPE_Q4_2
|
||||
.type_name = "DEPRECATED",
|
||||
|
@ -501,6 +513,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.from_float_reference = NULL,
|
||||
.vec_dot = NULL,
|
||||
.vec_dot_type = GGML_TYPE_COUNT,
|
||||
.nrows = 1,
|
||||
},
|
||||
[5] = { // GGML_TYPE_Q4_3
|
||||
.type_name = "DEPRECATED",
|
||||
|
@ -512,6 +525,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.from_float_reference = NULL,
|
||||
.vec_dot = NULL,
|
||||
.vec_dot_type = GGML_TYPE_COUNT,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q5_0] = {
|
||||
.type_name = "q5_0",
|
||||
|
@ -523,6 +537,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
|
||||
.vec_dot = ggml_vec_dot_q5_0_q8_0,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q5_1] = {
|
||||
.type_name = "q5_1",
|
||||
|
@ -534,6 +549,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
|
||||
.vec_dot = ggml_vec_dot_q5_1_q8_1,
|
||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q8_0] = {
|
||||
.type_name = "q8_0",
|
||||
|
@ -545,6 +561,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
|
||||
.vec_dot = ggml_vec_dot_q8_0_q8_0,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
||||
.nrows = 2,
|
||||
#else
|
||||
.nrows = 1,
|
||||
#endif
|
||||
},
|
||||
[GGML_TYPE_Q8_1] = {
|
||||
.type_name = "q8_1",
|
||||
|
@ -554,6 +575,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.from_float = quantize_row_q8_1,
|
||||
.from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
|
||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q2_K] = {
|
||||
.type_name = "q2_K",
|
||||
|
@ -565,6 +587,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
|
||||
.vec_dot = ggml_vec_dot_q2_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q3_K] = {
|
||||
.type_name = "q3_K",
|
||||
|
@ -576,6 +599,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
|
||||
.vec_dot = ggml_vec_dot_q3_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q4_K] = {
|
||||
.type_name = "q4_K",
|
||||
|
@ -587,6 +611,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
|
||||
.vec_dot = ggml_vec_dot_q4_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q5_K] = {
|
||||
.type_name = "q5_K",
|
||||
|
@ -598,6 +623,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
|
||||
.vec_dot = ggml_vec_dot_q5_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q6_K] = {
|
||||
.type_name = "q6_K",
|
||||
|
@ -609,6 +635,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
|
||||
.vec_dot = ggml_vec_dot_q6_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_IQ2_XXS] = {
|
||||
.type_name = "iq2_xxs",
|
||||
|
@ -620,6 +647,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.from_float_reference = NULL,
|
||||
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_IQ2_XS] = {
|
||||
.type_name = "iq2_xs",
|
||||
|
@ -631,6 +659,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.from_float_reference = NULL,
|
||||
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_IQ3_XXS] = {
|
||||
.type_name = "iq3_xxs",
|
||||
|
@ -642,6 +671,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
|
||||
.vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q8_K] = {
|
||||
.type_name = "q8_K",
|
||||
|
@ -1212,7 +1242,13 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)
|
|||
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
|
||||
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
|
||||
|
||||
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
|
||||
static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) {
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
UNUSED(by);
|
||||
UNUSED(bs);
|
||||
|
||||
#ifdef GGML_SIMD
|
||||
float sumf = 0.0f;
|
||||
const int np = (n & ~(GGML_F32_STEP - 1));
|
||||
|
@ -1249,7 +1285,13 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest
|
|||
*s = sumf;
|
||||
}
|
||||
|
||||
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
|
||||
static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) {
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
UNUSED(by);
|
||||
UNUSED(bs);
|
||||
|
||||
ggml_float sumf = 0.0;
|
||||
|
||||
#if defined(GGML_SIMD)
|
||||
|
@ -1515,7 +1557,7 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
|
|||
#endif
|
||||
}
|
||||
|
||||
inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s); }
|
||||
inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
|
||||
inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
|
||||
inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
|
||||
inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
|
||||
|
@ -2405,7 +2447,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|||
#elif defined(GGML_USE_CLBLAST)
|
||||
ggml_cl_init();
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
ggml_vk_init();
|
||||
ggml_vk_init_cpu_assist();
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
ggml_init_sycl();
|
||||
#endif
|
||||
|
@ -2532,7 +2574,8 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
|
|||
size_t max_size = 0;
|
||||
|
||||
for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
|
||||
max_size = MAX(max_size, ggml_nbytes(tensor));
|
||||
size_t bytes = ggml_nbytes(tensor);
|
||||
max_size = MAX(max_size, bytes);
|
||||
}
|
||||
|
||||
return max_size;
|
||||
|
@ -2668,7 +2711,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
|||
/*.nb =*/ { 0, 0, 0, 0 },
|
||||
/*.op =*/ GGML_OP_NONE,
|
||||
/*.op_params =*/ { 0 },
|
||||
/*.is_param =*/ false,
|
||||
/*.flags =*/ 0,
|
||||
/*.grad =*/ NULL,
|
||||
/*.src =*/ { NULL },
|
||||
/*.perf_runs =*/ 0,
|
||||
|
@ -6626,7 +6669,7 @@ struct ggml_tensor * ggml_cross_entropy_loss_back(
|
|||
void ggml_set_param(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * tensor) {
|
||||
tensor->is_param = true;
|
||||
tensor->flags |= GGML_TENSOR_FLAG_PARAM;
|
||||
|
||||
GGML_ASSERT(tensor->grad == NULL);
|
||||
tensor->grad = ggml_dup_tensor(ctx, tensor);
|
||||
|
@ -10109,6 +10152,7 @@ static void ggml_compute_forward_mul_mat(
|
|||
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
|
||||
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
||||
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
|
||||
int64_t const vec_dot_num_rows = type_traits[type].nrows;
|
||||
|
||||
GGML_ASSERT(ne0 == ne01);
|
||||
GGML_ASSERT(ne1 == ne11);
|
||||
|
@ -10276,12 +10320,23 @@ static void ggml_compute_forward_mul_mat(
|
|||
const int64_t blck_0 = 16;
|
||||
const int64_t blck_1 = 16;
|
||||
|
||||
// dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
|
||||
int64_t nrc = vec_dot_num_rows;
|
||||
// TODO: currently the mmla kernels support only even numbered rows/cols.
|
||||
// this check can be removed once they are extended to support odd numbered rows/cols too
|
||||
if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
|
||||
nrc = 1;
|
||||
}
|
||||
|
||||
const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
|
||||
|
||||
// attempt to reduce false-sharing (does not seem to make a difference)
|
||||
float tmp[16];
|
||||
// 16 * 2, accounting for mmla kernels
|
||||
float tmp[32];
|
||||
|
||||
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
|
||||
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
|
||||
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
|
||||
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ir1 += nrc) {
|
||||
const int64_t i13 = (ir1/(ne12*ne1));
|
||||
const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
|
||||
const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
|
||||
|
@ -10304,17 +10359,19 @@ static void ggml_compute_forward_mul_mat(
|
|||
(src1_cont || src1->type != vec_dot_type
|
||||
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
|
||||
: (i11*nb11 + i12*nb12 + i13*nb13));
|
||||
|
||||
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
|
||||
|
||||
//for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
|
||||
// vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
|
||||
//}
|
||||
|
||||
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
|
||||
vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
|
||||
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ir0 += nrc) {
|
||||
vec_dot(ne00, &tmp[ir0 - iir0], (nrc>1 ? 16 : 0), src0_row + ir0*nb01, (nrc>1 ? nb01 : 0), src1_col, (nrc>1 ? src1_col_stride : 0), nrc);
|
||||
}
|
||||
|
||||
for (int cn = 0; cn < nrc; ++cn) {
|
||||
memcpy(&dst_col[iir0 + cn*nb1/nb0], tmp + (cn*16), (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
|
||||
}
|
||||
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -10503,7 +10560,7 @@ static void ggml_compute_forward_mul_mat_id(
|
|||
//}
|
||||
|
||||
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
|
||||
vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
|
||||
vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1);
|
||||
}
|
||||
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
|
||||
}
|
||||
|
@ -11687,7 +11744,7 @@ static void ggml_compute_forward_soft_max_back_f32(
|
|||
|
||||
// linear runtime, no additional memory
|
||||
float dot_y_dy = 0;
|
||||
ggml_vec_dot_f32 (nc, &dot_y_dy, y, dy);
|
||||
ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
|
||||
ggml_vec_cpy_f32 (nc, dx, dy);
|
||||
ggml_vec_acc1_f32(nc, dx, -dot_y_dy);
|
||||
ggml_vec_mul_f32 (nc, dx, dx, y);
|
||||
|
@ -12007,8 +12064,10 @@ GGML_CALL void ggml_rope_yarn_corr_dims(
|
|||
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
|
||||
) {
|
||||
// start and end correction dims
|
||||
dims[0] = MAX(0, floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base)));
|
||||
dims[1] = MIN(n_dims - 1, ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base)));
|
||||
float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base));
|
||||
float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base));
|
||||
dims[0] = MAX(0, start);
|
||||
dims[1] = MIN(n_dims - 1, end);
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_rope_f32(
|
||||
|
@ -12486,9 +12545,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
|
|||
const int i1n = i10*ne11;
|
||||
for (int i00 = 0; i00 < ne00; i00++) {
|
||||
float v = 0;
|
||||
ggml_vec_dot_f16(ne02, &v,
|
||||
(ggml_fp16_t *) wdata_src + i1n,
|
||||
(ggml_fp16_t *) wdata_kernel + i00*ne02);
|
||||
ggml_vec_dot_f16(ne02, &v, 0,
|
||||
(ggml_fp16_t *) wdata_src + i1n, 0,
|
||||
(ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1);
|
||||
dst_data[i10*s0 + i00] += v;
|
||||
}
|
||||
}
|
||||
|
@ -12583,9 +12642,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
|
|||
const int i1n = i10*ne11;
|
||||
for (int i00 = 0; i00 < ne00; i00++) {
|
||||
float v = 0;
|
||||
ggml_vec_dot_f32(ne02, &v,
|
||||
wdata_src + i1n,
|
||||
wdata_kernel + i00*ne02);
|
||||
ggml_vec_dot_f32(ne02, &v, 0,
|
||||
wdata_src + i1n, 0,
|
||||
wdata_kernel + i00*ne02, 0, 1);
|
||||
dst_data[i10*s0 + i00] += v;
|
||||
}
|
||||
}
|
||||
|
@ -12900,9 +12959,9 @@ static void ggml_compute_forward_conv_transpose_2d(
|
|||
for (int i01 = 0; i01 < ne01; i01++) {
|
||||
for (int i00 = 0; i00 < ne00; i00++) {
|
||||
float v = 0;
|
||||
ggml_vec_dot_f16(ne03, &v,
|
||||
wdata_src + i1n,
|
||||
wdata_kernel + i01*ne00*ne03 + i00*ne03);
|
||||
ggml_vec_dot_f16(ne03, &v, 0,
|
||||
wdata_src + i1n, 0,
|
||||
wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
|
||||
dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
|
||||
}
|
||||
}
|
||||
|
@ -13331,9 +13390,9 @@ static void ggml_compute_forward_flash_attn_f32(
|
|||
const int i1 = ik1;
|
||||
|
||||
ggml_vec_dot_f32(neq0,
|
||||
S + i1,
|
||||
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
|
||||
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
|
||||
S + i1, 0,
|
||||
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
|
||||
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
|
||||
}
|
||||
|
||||
// scale
|
||||
|
@ -13416,9 +13475,9 @@ static void ggml_compute_forward_flash_attn_f32(
|
|||
const int iv3 = iq3;
|
||||
|
||||
ggml_vec_dot_f32(masked_begin,
|
||||
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
|
||||
(float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
|
||||
S);
|
||||
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
|
||||
(float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
|
||||
S, 0, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -13521,9 +13580,9 @@ static void ggml_compute_forward_flash_attn_f16(
|
|||
const int i1 = ik1;
|
||||
|
||||
ggml_vec_dot_f16(neq0,
|
||||
S + i1,
|
||||
(ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
|
||||
(ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
|
||||
S + i1, 0,
|
||||
(ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
|
||||
(ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
|
||||
}
|
||||
} else {
|
||||
for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
|
||||
|
@ -13625,9 +13684,9 @@ static void ggml_compute_forward_flash_attn_f16(
|
|||
const int iv3 = iq3;
|
||||
|
||||
ggml_vec_dot_f16(nev0,
|
||||
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
|
||||
(ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
|
||||
S16);
|
||||
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
|
||||
(ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
|
||||
S16, 0, 1);
|
||||
}
|
||||
} else {
|
||||
for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
|
||||
|
@ -13796,9 +13855,9 @@ static void ggml_compute_forward_flash_attn_ext_f16(
|
|||
}
|
||||
|
||||
ggml_vec_dot_f16(D,
|
||||
&s,
|
||||
(ggml_fp16_t *) ((char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3)),
|
||||
Q16);
|
||||
&s, 0,
|
||||
(ggml_fp16_t *) ((char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
|
||||
Q16, 0, 1);
|
||||
|
||||
s = s*scale + mv;
|
||||
|
||||
|
@ -13960,9 +14019,9 @@ static void ggml_compute_forward_flash_ff_f16(
|
|||
const int i1 = ib01;
|
||||
|
||||
ggml_vec_dot_f16(nea0,
|
||||
S + i1,
|
||||
(ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)),
|
||||
(ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)));
|
||||
S + i1, 0,
|
||||
(ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), 0,
|
||||
(ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)), 0, 1);
|
||||
}
|
||||
|
||||
ggml_vec_add_f32(neb01, S, S, (float *) b1->data);
|
||||
|
@ -13985,9 +14044,9 @@ static void ggml_compute_forward_flash_ff_f16(
|
|||
for (int64_t ic = 0; ic < nec01; ++ic) {
|
||||
|
||||
ggml_vec_dot_f16(neb01,
|
||||
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
|
||||
(ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)),
|
||||
S16);
|
||||
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
|
||||
(ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), 0,
|
||||
S16, 0, 1);
|
||||
}
|
||||
|
||||
ggml_vec_add_f32(nec01,
|
||||
|
@ -14174,9 +14233,9 @@ static void ggml_compute_forward_flash_attn_back_f32(
|
|||
const int i1 = ik1;
|
||||
|
||||
ggml_vec_dot_f32(neq0,
|
||||
S + i1,
|
||||
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
|
||||
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
|
||||
S + i1, 0,
|
||||
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
|
||||
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
|
||||
}
|
||||
|
||||
// scale
|
||||
|
@ -14321,7 +14380,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
|
|||
|
||||
// S = SM * (S - dot(SM, S))
|
||||
float dot_SM_gradSM = 0;
|
||||
ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, SM, S);
|
||||
ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1);
|
||||
ggml_vec_acc1_f32(M, S, -dot_SM_gradSM);
|
||||
ggml_vec_mul_f32 (masked_begin, S, S, SM);
|
||||
|
||||
|
@ -15158,10 +15217,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|||
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
|
||||
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
const bool skip_cpu = ggml_vk_compute_forward(params, tensor);
|
||||
const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
|
||||
#ifdef GGML_VULKAN_CHECK_RESULTS
|
||||
if (skip_cpu) {
|
||||
ggml_vk_check_results_1(params, tensor);
|
||||
ggml_vk_check_results_1_cpu_assist(params, tensor);
|
||||
}
|
||||
#endif
|
||||
if (skip_cpu) {
|
||||
|
@ -15623,7 +15682,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
|
|||
return NULL;
|
||||
}
|
||||
|
||||
if (node->is_param) {
|
||||
if (node->flags & GGML_TENSOR_FLAG_PARAM) {
|
||||
return node;
|
||||
}
|
||||
|
||||
|
@ -15657,7 +15716,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
|
|||
|
||||
clone->op = node->op;
|
||||
clone->grad = node->grad;
|
||||
clone->is_param = node->is_param;
|
||||
clone->flags = node->flags;
|
||||
clone->extra = node->extra;
|
||||
for (int k = 0; k < GGML_MAX_DIMS; ++k) {
|
||||
clone->nb[k] = node->nb[k];
|
||||
|
@ -16690,7 +16749,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
|
|||
for (int i = 0; i < gf->n_nodes; i++) {
|
||||
struct ggml_tensor * node = gf->nodes[i];
|
||||
|
||||
if (node->is_param) {
|
||||
if (node->flags & GGML_TENSOR_FLAG_PARAM) {
|
||||
GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
|
||||
ggml_build_forward_expand(gb, node->grad);
|
||||
}
|
||||
|
@ -16962,7 +17021,7 @@ struct ggml_compute_state_shared {
|
|||
atomic_int node_n; // active graph node
|
||||
atomic_int node_task; // active graph node task phase
|
||||
|
||||
bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
|
||||
ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
|
||||
void * abort_callback_data;
|
||||
};
|
||||
|
||||
|
@ -17589,12 +17648,12 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
|||
|
||||
#ifdef GGML_USE_VULKAN
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]);
|
||||
ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
|
||||
}
|
||||
ggml_vk_preallocate_buffers();
|
||||
ggml_vk_preallocate_buffers_cpu_assist();
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
|
||||
ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -17650,7 +17709,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
|||
}
|
||||
|
||||
#ifdef GGML_USE_VULKAN
|
||||
ggml_vk_graph_cleanup();
|
||||
ggml_vk_graph_cleanup_cpu_assist();
|
||||
#endif
|
||||
|
||||
// performance stats (graph)
|
||||
|
@ -18182,7 +18241,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
|
|||
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
|
||||
i,
|
||||
node->ne[0], node->ne[1], node->ne[2],
|
||||
ggml_op_name(node->op), node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
|
||||
ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ", node->perf_runs,
|
||||
(double) node->perf_cycles / (double) ggml_cycles_per_ms(),
|
||||
(double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
|
||||
(double) node->perf_time_us / 1000.0,
|
||||
|
@ -18275,7 +18334,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
|||
continue;
|
||||
}
|
||||
|
||||
if (node->is_param) {
|
||||
if (node->flags & GGML_TENSOR_FLAG_PARAM) {
|
||||
snprintf(color, sizeof(color), "yellow");
|
||||
} else if (node->grad) {
|
||||
if (ggml_graph_find(gf, node)) {
|
||||
|
@ -18449,7 +18508,7 @@ static enum ggml_opt_result ggml_opt_adam(
|
|||
int np = 0;
|
||||
int64_t nx = 0;
|
||||
for (int i = 0; i < gf->n_nodes; ++i) {
|
||||
if (gf->nodes[i]->is_param) {
|
||||
if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
|
||||
GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
|
||||
|
||||
GGML_ASSERT(np < GGML_MAX_PARAMS);
|
||||
|
@ -18702,7 +18761,7 @@ static enum ggml_opt_result linesearch_backtracking(
|
|||
}
|
||||
|
||||
// compute the initial gradient in the search direction
|
||||
ggml_vec_dot_f32(nx, &dginit, g, d);
|
||||
ggml_vec_dot_f32(nx, &dginit, 0, g, 0, d, 0, 1);
|
||||
|
||||
// make sure that d points to a descent direction
|
||||
if (0 < dginit) {
|
||||
|
@ -18752,7 +18811,7 @@ static enum ggml_opt_result linesearch_backtracking(
|
|||
return count;
|
||||
}
|
||||
|
||||
ggml_vec_dot_f32(nx, &dg, g, d);
|
||||
ggml_vec_dot_f32(nx, &dg, 0, g, 0, d, 0, 1);
|
||||
|
||||
// check the Wolfe condition
|
||||
if (dg < params->lbfgs.wolfe * dginit) {
|
||||
|
@ -18812,7 +18871,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|||
int np = 0;
|
||||
int nx = 0;
|
||||
for (int i = 0; i < gf->n_nodes; ++i) {
|
||||
if (gf->nodes[i]->is_param) {
|
||||
if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
|
||||
GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
|
||||
|
||||
GGML_ASSERT(np < GGML_MAX_PARAMS);
|
||||
|
@ -19013,8 +19072,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|||
// ys = y^t \cdot s -> 1 / \rho.
|
||||
// yy = y^t \cdot y.
|
||||
//
|
||||
ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]);
|
||||
ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]);
|
||||
ggml_vec_dot_f32(nx, &ys, 0, &lm_y[end[0]*nx], 0, &lm_s[end[0]*nx], 0, 1);
|
||||
ggml_vec_dot_f32(nx, &yy, 0, &lm_y[end[0]*nx], 0, &lm_y[end[0]*nx], 0, 1);
|
||||
|
||||
lm_ys[end[0]] = ys;
|
||||
|
||||
|
@ -19033,7 +19092,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|||
for (int i = 0; i < bound; ++i) {
|
||||
j[0] = (j[0] + m - 1) % m;
|
||||
// \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}
|
||||
ggml_vec_dot_f32(nx, &lm_alpha[j[0]], &lm_s[j[0]*nx], d);
|
||||
ggml_vec_dot_f32(nx, &lm_alpha[j[0]], 0, &lm_s[j[0]*nx], 0, d, 0, 1);
|
||||
lm_alpha[j[0]] /= lm_ys[j[0]];
|
||||
// q_{i} = q_{i+1} - \alpha_{i} y_{i}
|
||||
ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]);
|
||||
|
@ -19043,7 +19102,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|||
|
||||
for (int i = 0; i < bound; ++i) {
|
||||
// \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}
|
||||
ggml_vec_dot_f32(nx, &beta, &lm_y[j[0]*nx], d);
|
||||
ggml_vec_dot_f32(nx, &beta, 0, &lm_y[j[0]*nx], 0, d, 0, 1);
|
||||
beta /= lm_ys[j[0]];
|
||||
// \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}
|
||||
ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta);
|
||||
|
@ -19287,6 +19346,16 @@ enum ggml_opt_result ggml_opt_resume_g(
|
|||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void ggml_set_input(struct ggml_tensor * tensor) {
|
||||
tensor->flags |= GGML_TENSOR_FLAG_INPUT;
|
||||
}
|
||||
|
||||
void ggml_set_output(struct ggml_tensor * tensor) {
|
||||
tensor->flags |= GGML_TENSOR_FLAG_OUTPUT;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void ggml_quantize_init(enum ggml_type type) {
|
||||
ggml_critical_section_start();
|
||||
|
||||
|
@ -20931,4 +21000,12 @@ int ggml_cpu_has_vsx(void) {
|
|||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_matmul_int8(void) {
|
||||
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue