diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c
index 67b9e6025..8e659ede8 100644
--- a/ggml-phi-knc-dot_q5_K_q8_K.c
+++ b/ggml-phi-knc-dot_q5_K_q8_K.c
@@ -15,51 +15,78 @@
 // For block_q5_K and block_q8_K. only given the second time.
 #include "ggml-common.h"
 
+
+// This SIMD unit can work with 32 float32s at once.
+#define GGML_F32_STEP 32
+// We can fit 16 of these float32s in a single vector register.
+#define GGML_F32_EPR 16
+
+typedef float float32x8_t __attribute__((vector_size (64)));
+
 /* A forward declaration, to keep GCC happy. */
 void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc);
 
+inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target)
+{
+    uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0};
+    uint32_t mask=0x000000FF;
+    /* zmm8 is filled with zeros, then stored through write-mask k1 (0xFF: only the low 8 lanes are written). */
+    __asm__ __volatile__ (
+        "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value.
+        "kmov\t%[M],\t%%k1\n\t"
+        "vmovaps\t\t%%zmm8,\t%[RES]%{%%k1%}\n\t"
+        : [RES] "+m" (*target)
+        : [Z] "m" (zero), [M] "r" (mask)
+        : "zmm8", "k1");
+}
+
 void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+
+    /* interpret X and Y as vectors. */
     const block_q5_K * restrict x = vx;
     const block_q8_K * restrict y = vy;
-    
+
+    /* the number of blocks we will process this in. */
     const int nb = n / QK_K;
-    
+
     static const uint32_t kmask1 = 0x3f3f3f3f;
     static const uint32_t kmask2 = 0x0f0f0f0f;
     static const uint32_t kmask3 = 0x03030303;
-    
+
     uint32_t utmp[4];
 
-        int8_t aux8[QK_K];
-        int16_t aux16[16];
-        float sums [8];
-        memset(sums, 0, 8*sizeof(float));
+    int8_t aux8[QK_K];
+    int16_t aux16[16];
+    float32x8_t sums __attribute__((aligned(64)));
 
-        float sumf = 0;
-        for (int i = 0; i < nb; ++i) {
-            const uint8_t * restrict q4 = x[i].qs;
-            const uint8_t * restrict hm = x[i].qh;
-            const int8_t * restrict q8 = y[i].qs;
-            int8_t * restrict a = aux8;
-            for (int l = 0; l < 32; ++l) {
-                a[l+ 0] = q4[l] & 0xF;
-                a[l+32] = q4[l] >> 4;
-            }
-            for (int is = 0; is < 8; ++is) {
-                uint8_t m = 1 << is;
-                for (int l = 0; l < 8; ++l) a[8*is + l] -= (hm[l] & m ? 0 : 16);
-            }
+    /* use a vector operation to clear these floats. */
+    GGML_F32x8_VEC_ZERO(&sums);
 
-            const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-            const int8_t * restrict sc = x[i].scales;
-
-            for (int j = 0; j < QK_K/16; ++j) {
-                const float dl = d * sc[j];
-                for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l];
-                for (int l = 0; l < 8; ++l) sums[l] += dl * (aux16[l] + aux16[8+l]);
-                q8 += 16; a += 16;
-            }
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+        const uint8_t * restrict q4 = x[i].qs;
+        const uint8_t * restrict hm = x[i].qh;
+        const int8_t * restrict q8 = y[i].qs;
+        int8_t * restrict a = aux8;
+        for (int l = 0; l < 32; ++l) {
+            a[l+ 0] = q4[l] & 0xF;
+            a[l+32] = q4[l] >> 4;
         }
-        for (int l = 0; l < 8; ++l) sumf += sums[l];
-        *s = sumf;
+        for (int is = 0; is < 8; ++is) {
+            uint8_t m = 1 << is;
+            for (int l = 0; l < 8; ++l) a[8*is + l] -= (hm[l] & m ? 0 : 16);
+        }
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const int8_t * restrict sc = x[i].scales;
+
+        for (int j = 0; j < QK_K/16; ++j) {
+            const float dl = d * sc[j];
+            for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l];
+            for (int l = 0; l < 8; ++l) ((float *)&sums)[l] += dl * (aux16[l] + aux16[8+l]);
+            q8 += 16; a += 16;
+        }
+    }
+    for (int l = 0; l < 8; ++l) sumf += ((float *)&sums)[l];
+    *s = sumf;
 }