From 7a00422fa36379e482153b55ec226dcc4f37cbe3 Mon Sep 17 00:00:00 2001
From: Julia Longtin <julia.longtin@gmail.com>
Date: Sat, 23 Mar 2024 19:55:12 +0000
Subject: [PATCH] Try to use the vectorized zeroing function for the sums accumulator.

---
 ggml-phi-knc-dot_q5_K_q8_K.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c
index 668bae93b..68c1aa965 100644
--- a/ggml-phi-knc-dot_q5_K_q8_K.c
+++ b/ggml-phi-knc-dot_q5_K_q8_K.c
@@ -30,7 +30,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target)
 {
   uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0};
-  uint32_t mask=0x0000FF00;
+  uint32_t mask=0x0000000F;
 
   __asm__ __volatile__ (
                         "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t"        // use an upscaling operator to clear our value.
@@ -62,9 +62,12 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
   int8_t  aux8[QK_K];
   int16_t aux16[8];
-  float   sums [8];
+  float32x8_t sums;
   int32_t aux32[8];
-  memset(sums, 0, 8*sizeof(float));
+
+  //memset(sums, 0, 8*sizeof(float));
+
+  GGML_F32x8_VEC_ZERO(&sums);
 
   float sumf = 0;
   for (int i = 0; i < nb; ++i) {
@@ -110,10 +113,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
       q8 += 8; a += 8;
     }
     const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-    for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
+    for (int l = 0; l < 8; ++l) ((float *)&sums)[l] += d * aux32[l];
     const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
     sumf -= dmin * sumi;
   }
-  for (int l = 0; l < 8; ++l) sumf += sums[l];
+  for (int l = 0; l < 8; ++l) sumf += ((float *)&sums)[l];
   *s = sumf;
 }