From cb96a48ed123518665ddcf1fa9a06f47bccf5512 Mon Sep 17 00:00:00 2001
From: Julia Longtin <julia.longtin@gmail.com>
Date: Fri, 10 May 2024 16:14:28 +0000
Subject: [PATCH] issue up-front prefetches for the result and both input
 vectors, and invert the test of our clear flag for clarity.

---
 ggml-phi-knc.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c
index a273c9525..9d7a34199 100644
--- a/ggml-phi-knc.c
+++ b/ggml-phi-knc.c
@@ -38,11 +38,20 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x
     uint8_t zero = 0;
 
     __asm__ __volatile__ (
-                          "mov\t%[ITER],%%r8\n\t"                       // how many register sized chunks are we responsible for
-                          "mov\t%[VEC1],%%r10\n\t"                      // where do we start work in mvec1?
-                          "mov\t%[VEC2],%%r12\n\t"                      // where do we start work in mvec2?
-                          "cmp\t$1,%[CLR]\n\t"                          // should we clear the sum before we start?
-                          "jne\t4f\n\t"
+                          "vprefetchenta\t(%[RES])\n\t"
+                          "vprefetch0\t(%[VEC1])\n\t"
+                          "vprefetch1\t64(%[VEC1])\n\t"
+                          "vprefetch0\t128(%[VEC1])\n\t"
+                          "vprefetch1\t192(%[VEC1])\n\t"
+                          "vprefetch0\t(%[VEC2])\n\t"
+                          "vprefetch1\t64(%[VEC2])\n\t"
+                          "vprefetch0\t128(%[VEC2])\n\t"
+                          "vprefetch1\t192(%[VEC2])\n\t"
+                          "mov\t%[ITER],%%r8\n\t"                       // How many vector sized chunks we are responsible for.
+                          "mov\t%[VEC1],%%r10\n\t"                      // Where do we start work in mvec1?
+                          "mov\t%[VEC2],%%r12\n\t"                      // Where do we start work in mvec2?
+                          "cmp\t$0,%[CLR]\n\t"                          // Should we clear the sum before we start?
+                          "jz\t4f\n\t"
                           "vbroadcastss\t%[Z]%{uint8%},\t%%zmm0\n\t"    // if so, use an upscaling operator to do it.
                           "vprefetchnta\t(%%r10)\n\t"
                           "vprefetchnta\t(%%r12)\n\t"