diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index 095241cda..9f9cf1f0d 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -107,6 +107,27 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "cc", "memory", "r8", "r10", "r12"); } +// Multiply the first `items` elements of mvec1 with the corresponding elements of mvec2, adding each product to the corresponding element of sumvec. Uses a write mask with the low `items` bits set, so only those elements are loaded, accumulated and stored; meant for the last, partial run-through. NOTE(review): the mask is built by shifting a signed int literal, so items presumably must stay below 16 -- confirm against callers. +inline static void GGML_F32x16_VEC_FMA_TAIL(const float32x16_t *mvec1, const float32x16_t *mvec2, float32x16_t *sumvec, size_t items) +{ + uint32_t mask = (0x00000001 << items)-1; + + __asm__ __volatile__ ( + "vprefetchnta\t(%[VEC1])\n\t" + "vprefetchnta\t(%[VEC2])\n\t" + "vmovaps\t\t(%[RES]),\t%%zmm0\n\t" // Load our initial state from sum. + "kmov\t%[MASK],%%k1\n\t" // Load the mask that limits the following operations to part of a vector. + "vmovaps\t\t(%[VEC1]),\t%%zmm1%{%%k1%}\n\t" // Partially load the two input vectors. + "vmovaps\t\t(%[VEC2]),\t%%zmm2%{%%k1%}\n\t" + "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0%{%%k1%}\n\t" // Perform a fused multiply-add on the masked elements. + "vmovnraps\t\t%%zmm0,\t(%[RES])%{%%k1%}\n\t" // Store only the masked elements of the result. + : [RES] "+r" (sumvec) + : [VEC1] "r" (mvec1), + [VEC2] "r" (mvec2), + [MASK] "r" (mask) + : "zmm0", "zmm1", "zmm2", "k1", "memory"); +} + // NOTE: x and y inputs must be __attribute__((aligned(64))); void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) { @@ -118,26 +139,11 @@ void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restri GGML_F32x16_VEC_FMA((const float32x16_t *)x, (const float32x16_t *)y, &sum, np/GGML_F32_EPR, 1); - // add the leftovers, that could not be handled by the vector loop. - if ( n - np != 0 ) - { - // our extended last part of x. - float32x16_t v1; - GGML_F32x16_VEC_ZERO(&v1); - // our extended last part of y. 
- float32x16_t v2; - GGML_F32x16_VEC_ZERO(&v2); - - memcpy(&v1, &x[np], (n - np)*sizeof(float)); - memcpy(&v2, &y[np], (n - np)*sizeof(float)); - - GGML_F32x16_VEC_FMA(&v1, - &v2, - &sum, 1, 0); - } + // add the leftovers, that could not be handled by the whole vector loop. + if ( n - np != 0 ) GGML_F32x16_VEC_FMA_TAIL((const float32x16_t *)&x[np], (const float32x16_t *)&y[np], &sum, n-np); // reduce sum, and store it in s. - for (uint32_t i=0; i