From 2a47e5f05f938598cb68771b9f98a33313f9c17b Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sun, 24 Mar 2024 14:18:08 +0000 Subject: [PATCH] separate filling aux16 from consuming aux16 by making it an array of vectors. --- ggml-phi-knc-dot_q5_K_q8_K.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 26e03d241..eebd12d89 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -147,7 +147,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r float32x16_t sums __attribute__((aligned(128))); int8x16_t aux8[QK_K/16] __attribute__((aligned(32))); - int16x16_t aux16 __attribute__((aligned(64))); + int16x16_t aux16[QK_K/16] __attribute__((aligned(64))); int32x16_t aux32 __attribute__((aligned(128))); GGML_F32x16_VEC_ZERO(&sums); @@ -188,15 +188,19 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; int is = 0; for (int j = 0; j < QK_K/32; ++j) { - int32_t scale = scales[is++]; - for (int l = 0; l < 16; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - GGML_I16x16_S_FMA_I32x16 (&aux16, scale, &aux32); + for (int l = 0; l < 16; ++l) ((int16_t *)&aux16[j*2])[l] = q8[l] * a[l]; q8 += 16; a += 16; - for (int l = 0; l < 16; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - // FIXME: while comparing FMA output to the original output, the original had an error. hunt it down. - GGML_I16x16_S_FMA_I32x16 (&aux16, scale, &aux32); + for (int l = 0; l < 16; ++l) ((int16_t *)&aux16[(j*2)+1])[l] = q8[l] * a[l]; q8 += 16; a += 16; } + + // FIXME: while comparing FMA output to the original output, the original had an error. hunt it down. + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + GGML_I16x16_S_FMA_I32x16 (&aux16[j*2], scale, &aux32); + GGML_I16x16_S_FMA_I32x16 (&aux16[(j*2)+1], scale, &aux32); + } + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 16; ++l) ((float *)&sums)[l] += d * ((int32_t *)&aux32)[l]; const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;