From f346a41deb2f4fc8be1ba321cb360c06262b511b Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Wed, 13 Mar 2024 19:18:10 +0000 Subject: [PATCH 01/52] try to implement one intrinsic --- ggml.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 80b987b37..0716e6bd9 100644 --- a/ggml.c +++ b/ggml.c @@ -803,7 +803,38 @@ inline static float vaddvq_f32(float32x4_t v) { // number of elements to fit in a single register // -#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) + +#if defined(__k1om__) /* Xeon PHI Knights Corner (IMCI) */ + +// No, we have an SIMD unit. +// #define GGML_SIMD + +// This SIMD unit can work with 32 float32s at once. +#define GGML_F32_STEP 32 +// We can fit 16 of these float32s in a single vector register. +#define GGML_F32_EPR 16 + +// because we are not defining GGML_SIMD, we have to do this ourself. +#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR) + +// our vector. 128*32=512 +typedef float32_t float32x16_t __attribute__((vector_size (128))); +#define GGML_F32x16 float32x16_t +#define GGML_F32x16_ZERO \ + { \ + __mmask16 mask=0xFFFF; \ + float32x16_t res; \ + asm ("vbroadcastf32x4 [RES] {[M]}, 0[%2]" \ + : [RES] "=x"(res) \ + : [M] "k" mask, \ + [V] "r" 0.0f) \ + return res; \ + } +//vdupq_n_f32(0.0f) + +#define GGML_F32_VEC GGML_F32x16 + +#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) #define GGML_SIMD @@ -1330,6 +1361,7 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } + static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); @@ -1362,6 +1394,17 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * for (int i = np; i < n; ++i) { sumf += x[i]*y[i]; } +#elif defined(__k1om__) + // our result, in the end. + float sumf = 0.0f; + // the number of vector-sized steps we will need to do. + const int np = (n & ~(GGML_F32_STEP - 1)); + + GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; + for (int i = 0; i < 16; ++i) { + fprintf(stderr, "boo: %f\n",sum[0]); + } + #else // scalar ggml_float sumf = 0.0; From a1ae649662d40a38b0520c537782456355f6bc29 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Wed, 13 Mar 2024 19:23:53 +0000 Subject: [PATCH 02/52] use right type, and define GGML_F32_VEC_ZERO. --- ggml.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 0716e6bd9..e56d7337a 100644 --- a/ggml.c +++ b/ggml.c @@ -818,7 +818,7 @@ inline static float vaddvq_f32(float32x4_t v) { #define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR) // our vector. 
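A quick note on the GCC vector extension used throughout this series: the argument to __attribute__((vector_size (N))) is a size in bytes, not bits, so sixteen 4-byte floats occupy one 512-bit ZMM register at vector_size (64). A minimal sketch of such a typedef, using nothing but standard GCC vector extensions (the name f32x16 is illustrative, not from this series):

    /* 16 float32 lanes: 16 * 4 bytes = 64 bytes = one 512-bit register. */
    typedef float f32x16 __attribute__((vector_size (64)));

    /* GCC then provides elementwise operators on the type for free: */
    static inline f32x16 f32x16_mul(f32x16 a, f32x16 b) { return a * b; }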
128*32=512 -typedef float32_t float32x16_t __attribute__((vector_size (128))); +typedef float float32x16_t __attribute__((vector_size (128))); #define GGML_F32x16 float32x16_t #define GGML_F32x16_ZERO \ { \ @@ -833,6 +833,7 @@ typedef float32_t float32x16_t __attribute__((vector_size (128))); //vdupq_n_f32(0.0f) #define GGML_F32_VEC GGML_F32x16 +#define GGML_F32_VEC_ZERO GGML_F32x16_ZERO #elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) From 7a57feba0cd98c2bfe0f3a4a05a7327145afb506 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Wed, 13 Mar 2024 19:26:54 +0000 Subject: [PATCH 03/52] import intrinsics. --- ggml.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ggml.c b/ggml.c index e56d7337a..009739e27 100644 --- a/ggml.c +++ b/ggml.c @@ -41,6 +41,10 @@ #pragma warning(disable: 4996) #endif +#if defined(__k1om__) +#include +#endif + #if defined(_WIN32) #include From 717e164dd7178cff77237bb1d168bc29f32c4b87 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 16 Mar 2024 14:05:03 +0000 Subject: [PATCH 04/52] implement F32 dot products. --- Makefile | 3 ++ ggml-phi-knc.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++++ ggml-phi-knc.h | 16 +++++++ ggml.c | 48 ++------------------ 4 files changed, 139 insertions(+), 44 deletions(-) create mode 100644 ggml-phi-knc.c create mode 100644 ggml-phi-knc.h diff --git a/Makefile b/Makefile index d7bd4ed3b..ea27321bf 100644 --- a/Makefile +++ b/Makefile @@ -291,6 +291,9 @@ ifeq "${K1OM}" "" # Usage SSSE3-only (Not is SSE3!) #MK_CFLAGS += -mssse3 #MK_CXXFLAGS += -mssse3 +else + OBJS += ggml-phi-knc.o + MK_CFLAGS += -march=knc -mtune=knc endif endif diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c new file mode 100644 index 000000000..ff94104a7 --- /dev/null +++ b/ggml-phi-knc.c @@ -0,0 +1,116 @@ +#include + +#include + +#include + +static inline _Bool is_aligned(const void *restrict pointer, size_t byte_count) +{ return (uintptr_t)pointer % byte_count == 0; } + +// No, we have an SIMD unit. +// #define GGML_SIMD + +// This SIMD unit can work with 32 float32s at once. +#define GGML_F32_STEP 32 +// We can fit 16 of these float32s in a single vector register. +#define GGML_F32_EPR 16 + +// because we are not defining GGML_SIMD, we have to do this ourself. +#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR) + +// a single vector. 128*32=512 +typedef float float32x16_t __attribute__((vector_size (128))); +#define GGML_F32x16 float32x16_t + +// from chatGPT. nuke this later. +#include + +inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target) +{ + // we only need a mask16, but register sizes... + __mmask32 mask=0xFFFFFFFF; + + // FIXME: how do we tell GNU AS to perform upconverts? + float zero[4] __attribute__((aligned(64))) = {0.0f,0.0f,0.0f,0.0f}; + + __asm__ __volatile__ ("movl\t%[M],\t%%eax\n\t" + "kmov %%eax,\t%%k1\n\t" + "vbroadcastf32x4\t%[Z],\t%%zmm0%{%%k1%}\n\t" + "vmovaps\t\t%%zmm0,\t%[RES]%{%%k1%}\n\t" + : [RES] "+m" (*target) + : [M] "m" (mask), + [Z] "m" (zero) + : "eax", "k1", "zmm0"); +} + +// multiply each item in mvec1 with the corresponding item in mvec2, adding the result to the corresponding item in sum. +inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x16_t *mvec2, float32x16_t *sumvec, size_t iterations) +{ + // we only need a mask16, but register sizes... + __mmask32 mask=0xFFFFFFFF; + __asm__ __volatile__ ( + "vmovaps\t\t(%[RES]),\t%%zmm0\n\t" // load our initial state.. 
+ "1:\n\t" + "cmp $0,\t%[ITER]\n\t" // Compare iterations to 0 + "je\t2f\n\t" // Jump to label 2 if zero (end of loop) + "vmovaps\t\t(%[VEC1]),\t%%zmm1\n\t" // Load two vectors. + "vmovaps\t\t(%[VEC2]),\t%%zmm2\n\t" + "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add. + "add $64,\t%[VEC1]\n\t" // Move to the next float32x16_t (64 bytes ahead) + "add $64,\t%[VEC2]\n\t" + "sub $1,\t%[ITER]\n\t" // Decrement iterations + "jmp 1b\n\t" // Jump back to the start of the loop + "2: \n\t" // Label for loop end + "vmovaps\t\t%%zmm0,\t(%[RES])\n\t" // save our results. + : [RES] "+r" (sumvec), + [ITER] "+r" (iterations) + : [M] "r" (mask), + [VEC1] "r" (mvec1), + [VEC2] "r" (mvec2) + : "zmm0", "zmm1", "zmm2", "cc", "memory"); +} + + +// NOTE: all inputs must be __attribute__((aligned(64))); +float DotProduct_F32(const float * restrict inVec1, const float * restrict inVec2, uint32_t count) +{ + // our single result, in the end. + float sumf = 0.0f; + + // our sum. + float32x16_t sum __attribute__((aligned(64))); + + // the number of vector-sized steps we will need to do. + const uint32_t np = (count & ~(GGML_F32_EPR - 1)); + + GGML_F32x16_VEC_ZERO(&sum); + + // 0 indexed cycle count + // for (uint32_t cycle = 0; cycle < (np/GGML_F32_EPR); ++cycle) + GGML_F32x16_VEC_FMA((float32x16_t *)inVec1, (float32x16_t *)inVec2, &sum, np/GGML_F32_EPR); + + if (count != np) + { + printf("handling remainder %u\n",count-np); + // add the leftovers, that could not be handled by the vector loop. + // our extended last part of inVec1. + float32x16_t v1 __attribute__((aligned(64))); + GGML_F32x16_VEC_ZERO(&v1); + // our extended last part of inVec2. + float32x16_t v2 __attribute__((aligned(64))); + GGML_F32x16_VEC_ZERO(&v2); + + memcpy(&v1, &inVec1[np], (count - np)*sizeof(float)); + memcpy(&v2, &inVec2[np], (count - np)*sizeof(float)); + + GGML_F32x16_VEC_FMA(&v1, + &v2, + &sum, 1); + } + + // reduce sum0..sumX to sumf + for (uint32_t i=0; i + +GGML_CALL float DotProduct_F32(const float * restrict vec1, const float * restrict vec2, uint32_t count); + +#ifdef __cplusplus +} +#endif + diff --git a/ggml.c b/ggml.c index 009739e27..696b46216 100644 --- a/ggml.c +++ b/ggml.c @@ -42,7 +42,7 @@ #endif #if defined(__k1om__) -#include +#include #endif #if defined(_WIN32) @@ -808,38 +808,7 @@ inline static float vaddvq_f32(float32x4_t v) { // -#if defined(__k1om__) /* Xeon PHI Knights Corner (IMCI) */ - -// No, we have an SIMD unit. -// #define GGML_SIMD - -// This SIMD unit can work with 32 float32s at once. -#define GGML_F32_STEP 32 -// We can fit 16 of these float32s in a single vector register. -#define GGML_F32_EPR 16 - -// because we are not defining GGML_SIMD, we have to do this ourself. -#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR) - -// our vector. 
128*32=512 -typedef float float32x16_t __attribute__((vector_size (128))); -#define GGML_F32x16 float32x16_t -#define GGML_F32x16_ZERO \ - { \ - __mmask16 mask=0xFFFF; \ - float32x16_t res; \ - asm ("vbroadcastf32x4 [RES] {[M]}, 0[%2]" \ - : [RES] "=x"(res) \ - : [M] "k" mask, \ - [V] "r" 0.0f) \ - return res; \ - } -//vdupq_n_f32(0.0f) - -#define GGML_F32_VEC GGML_F32x16 -#define GGML_F32_VEC_ZERO GGML_F32x16_ZERO - -#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) #define GGML_SIMD @@ -1374,7 +1343,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * UNUSED(by); UNUSED(bs); -#ifdef GGML_SIMD +#if defined(GGML_SIMD) float sumf = 0.0f; const int np = (n & ~(GGML_F32_STEP - 1)); @@ -1400,16 +1369,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * sumf += x[i]*y[i]; } #elif defined(__k1om__) - // our result, in the end. - float sumf = 0.0f; - // the number of vector-sized steps we will need to do. - const int np = (n & ~(GGML_F32_STEP - 1)); - - GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; - for (int i = 0; i < 16; ++i) { - fprintf(stderr, "boo: %f\n",sum[0]); - } - + float sumf = DotProduct_F32(x, y, n); #else // scalar ggml_float sumf = 0.0; From 257ffd99550f9c55e434929e23705d4e964b9d1d Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 16 Mar 2024 14:13:22 +0000 Subject: [PATCH 05/52] Update ggml.c --- ggml.c | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml.c b/ggml.c index 696b46216..223dfcda1 100644 --- a/ggml.c +++ b/ggml.c @@ -807,7 +807,6 @@ inline static float vaddvq_f32(float32x4_t v) { // number of elements to fit in a single register // - #if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) #define GGML_SIMD From e216a2f133a3d2583e8dc426d50e2d8e66e4b5c3 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 16 Mar 2024 14:15:51 +0000 Subject: [PATCH 06/52] Update ggml.c --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 223dfcda1..76530be9f 100644 --- a/ggml.c +++ b/ggml.c @@ -1342,7 +1342,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * UNUSED(by); UNUSED(bs); -#if defined(GGML_SIMD) +#ifdef GGML_SIMD float sumf = 0.0f; const int np = (n & ~(GGML_F32_STEP - 1)); From eac00a72d512d56260eb4e4aca65c7f936b6273a Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 16 Mar 2024 14:17:21 +0000 Subject: [PATCH 07/52] Update ggml.c --- ggml.c | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml.c b/ggml.c index 76530be9f..708d1698b 100644 --- a/ggml.c +++ b/ggml.c @@ -1334,7 +1334,6 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } - static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); From fe663c1b63600c23b3be25a85101c67dc6300000 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sun, 17 Mar 2024 21:15:32 +0000 Subject: [PATCH 08/52] merge from upstream --- Makefile | 13 ++++ ggml-phi-knc.c | 166 +++++++++++++++++++++++++++++-------------------- ggml-phi-knc.h | 5 +- ggml.c | 11 +++- 4 files changed, 121 
insertions(+), 74 deletions(-) diff --git a/Makefile b/Makefile index ea27321bf..3dbf3f2f0 100644 --- a/Makefile +++ b/Makefile @@ -691,6 +691,9 @@ clean: # Helper function that replaces .c, .cpp, and .cu file endings with .o: GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1)))) +# Helper function that replaces .c, .cpp, and .cu file endings with .s: +GET_ASM_FILE = $(patsubst %.c,%.s,$(patsubst %.cpp,%.s,$(patsubst %.cu,%.s,$(1)))) + main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -698,6 +701,16 @@ main: examples/main/main.cpp ggml.o llama.o $(C @echo '==== Run ./main -h for help. ====' @echo +bench-phi-knc.s: bench-phi-knc.c + $(CC) $(CFLAGS) -S $< -o $(call GET_ASM_FILE, $<) + +ggml-phi-knc.s: ggml-phi-knc.c + $(CC) $(CFLAGS) -S $< -o $(call GET_ASM_FILE, $<) + +bench-phi-knc: bench-phi-knc.c ggml-phi-knc.o + $(CC) $(CFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CC) $(CFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index ff94104a7..648f81bcf 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -4,9 +4,6 @@ #include -static inline _Bool is_aligned(const void *restrict pointer, size_t byte_count) -{ return (uintptr_t)pointer % byte_count == 0; } - // No, we have an SIMD unit. // #define GGML_SIMD @@ -15,102 +12,135 @@ static inline _Bool is_aligned(const void *restrict pointer, size_t byte_count) // We can fit 16 of these float32s in a single vector register. #define GGML_F32_EPR 16 -// because we are not defining GGML_SIMD, we have to do this ourself. -#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR) - // a single vector. 128*32=512 typedef float float32x16_t __attribute__((vector_size (128))); #define GGML_F32x16 float32x16_t -// from chatGPT. nuke this later. -#include +// A forward declaration, to keep GCC happy... +void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target) { - // we only need a mask16, but register sizes... - __mmask32 mask=0xFFFFFFFF; - - // FIXME: how do we tell GNU AS to perform upconverts? + // FIXME: how do we tell GNU AS to perform upconverts? Could remove two memory reads here... float zero[4] __attribute__((aligned(64))) = {0.0f,0.0f,0.0f,0.0f}; - __asm__ __volatile__ ("movl\t%[M],\t%%eax\n\t" - "kmov %%eax,\t%%k1\n\t" - "vbroadcastf32x4\t%[Z],\t%%zmm0%{%%k1%}\n\t" - "vmovaps\t\t%%zmm0,\t%[RES]%{%%k1%}\n\t" - : [RES] "+m" (*target) - : [M] "m" (mask), - [Z] "m" (zero) - : "eax", "k1", "zmm0"); -} - -// multiply each item in mvec1 with the corresponding item in mvec2, adding the result to the corresponding item in sum. -inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x16_t *mvec2, float32x16_t *sumvec, size_t iterations) -{ - // we only need a mask16, but register sizes... - __mmask32 mask=0xFFFFFFFF; __asm__ __volatile__ ( - "vmovaps\t\t(%[RES]),\t%%zmm0\n\t" // load our initial state.. 
- "1:\n\t" - "cmp $0,\t%[ITER]\n\t" // Compare iterations to 0 - "je\t2f\n\t" // Jump to label 2 if zero (end of loop) - "vmovaps\t\t(%[VEC1]),\t%%zmm1\n\t" // Load two vectors. - "vmovaps\t\t(%[VEC2]),\t%%zmm2\n\t" - "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add. - "add $64,\t%[VEC1]\n\t" // Move to the next float32x16_t (64 bytes ahead) - "add $64,\t%[VEC2]\n\t" - "sub $1,\t%[ITER]\n\t" // Decrement iterations - "jmp 1b\n\t" // Jump back to the start of the loop - "2: \n\t" // Label for loop end - "vmovaps\t\t%%zmm0,\t(%[RES])\n\t" // save our results. - : [RES] "+r" (sumvec), - [ITER] "+r" (iterations) - : [M] "r" (mask), - [VEC1] "r" (mvec1), - [VEC2] "r" (mvec2) - : "zmm0", "zmm1", "zmm2", "cc", "memory"); + "vbroadcastf32x4\t%[Z],\t%%zmm8\n\t" // use an upscaling operator to clear our value. + "vmovaps\t\t%%zmm8,\t%[RES]\n\t" + : [RES] "+m" (*target) + : [Z] "m" (zero) + : "zmm8"); } - -// NOTE: all inputs must be __attribute__((aligned(64))); -float DotProduct_F32(const float * restrict inVec1, const float * restrict inVec2, uint32_t count) +// Multiply each item in mvec1 with the corresponding item in mvec2, adding the result to the corresponding item in sum. optionally clear the sum before starting. +inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x16_t *mvec2, float32x16_t *sumvec, size_t iterations, int clear) { - // our single result, in the end. - float sumf = 0.0f; + // FIXME: how do we tell GNU AS to perform upconverts? Could remove two memory reads here... + float zero[4] __attribute__((aligned(64))) = {0.0f,0.0f,0.0f,0.0f}; + __asm__ __volatile__ ( + "mov\t%[ITER],%%r8\n\t" // how many register sized chunks are we responsible for + "mov\t%[VEC1],%%r10\n\t" // where do we start work in mvec1? + "mov\t%[VEC2],%%r12\n\t" // where do we start work in mvec2? + "cmp\t$1,%[CLR]\n\t" // should we clear the sum before we start? + "jne\t4f\n\t" + "vbroadcastf32x4\t%[Z],\t%%zmm0\n\t" // if so, use an upscaling operator to do it. + "vprefetchnta\t(%%r10)\n\t" + "vprefetchnta\t(%%r12)\n\t" + "vprefetch1\t128(%%r10)\n\t" + "vprefetch1\t128(%%r12)\n\t" + "vprefetch1\t256(%%r10)\n\t" + "vprefetch1\t256(%%r12)\n\t" + "vprefetch1\t384(%%r10)\n\t" + "vprefetch1\t384(%%r12)\n\t" + "vprefetch1\t512(%%r10)\n\t" + "vprefetch1\t512(%%r12)\n\t" + "jmp\t1f\n\t" + "4:\n\t" + "vprefetch0\t(%[RES])\n\t" + "vmovaps\t\t(%[RES]),\t%%zmm0\n\t" // otherwise, load our inital state from sum.. + "vprefetchnta\t(%%r10)\n\t" + "vprefetchnta\t(%%r12)\n\t" + "1:\n\t" + "cmp\t$3,\t%%r8\n\t" // Compare iterations to three. + "jnae\t6f\n\t" // If there are not three iterations left, jump to label 6. + "vmovaps\t\t(%%r10),\t%%zmm1\n\t" // Load two vectors. + "vmovaps\t\t(%%r12),\t%%zmm2\n\t" + "sub\t$3,\t%%r8\n\t" // Decrement iterations + "vprefetchnta\t192(%%r10)\n\t" // prefetch the next float32x16_t block (192 bytes ahead) + "vprefetchnta\t192(%%r12)\n\t" + "vmovaps\t\t64(%%r10),\t%%zmm3\n\t" // Load two vectors. + "vmovaps\t\t64(%%r12),\t%%zmm4\n\t" + "vprefetch1\t320(%%r10)\n\t" // prefetch the block after the block after the next float32x16_t block (320 bytes ahead) + "vprefetch1\t320(%%r12)\n\t" + "vmovaps\t\t128(%%r10),\t%%zmm5\n\t" // Load two vectors. 
+ "vmovaps\t\t128(%%r12),\t%%zmm6\n\t" + "vprefetch1\t576(%%r10)\n\t" + "vprefetch1\t576(%%r12)\n\t" + "vprefetch1\t704(%%r10)\n\t" + "vprefetch1\t704(%%r12)\n\t" + "add\t$192,\t%%r10\n\t" // Move to the next float32x16_t block (192 bytes ahead) + "add\t$192,\t%%r12\n\t" + "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add + "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add + "vfmadd231ps\t%%zmm5,\t%%zmm6,\t%%zmm0\n\t" // Perform a fused multiply add + "jmp\t1b\n\t" // Jump back to the start of the loop + "6:\n\t" // we know we are near the tail. handle 2, 1, and 0 cases. + "cmp\t$0,\t%%r8\n\t" // Compare iterations to zero + "je\t2f\n\t" // Jump to label 2 if zero (end of loop) + "cmp\t$1,\t%%r8\n\t" // Compare iterations to one + "vmovaps\t\t(%%r10),\t%%zmm1\n\t" // Load two vectors. + "vmovaps\t\t(%%r12),\t%%zmm2\n\t" + "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add + "je\t2f\n\t" // Jump to label 3 if one (end of loop) + // No compare. we must be two. + "vmovaps\t\t64(%%r10),\t%%zmm3\n\t" // Load two vectors. + "vmovaps\t\t64(%%r12),\t%%zmm4\n\t" + "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add + "2:\n\t" // Label for loop end + "vmovaps\t\t%%zmm0,\t(%[RES])\n\t" // save our results. + : [RES] "+r" (sumvec) + : [ITER] "r" (iterations), + [VEC1] "r" (mvec1), + [VEC2] "r" (mvec2), + [CLR] "r" (clear), + [Z] "m" (zero) + : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "cc", "memory", "r8", "r10", "r12"); +} + +// NOTE: x and y inputs must be __attribute__((aligned(64))); +void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) +{ // our sum. float32x16_t sum __attribute__((aligned(64))); // the number of vector-sized steps we will need to do. - const uint32_t np = (count & ~(GGML_F32_EPR - 1)); + const uint32_t np = (n & ~(GGML_F32_EPR - 1)); - GGML_F32x16_VEC_ZERO(&sum); + GGML_F32x16_VEC_FMA((const float32x16_t *)x, (const float32x16_t *)y, &sum, np/GGML_F32_EPR, 1); - // 0 indexed cycle count - // for (uint32_t cycle = 0; cycle < (np/GGML_F32_EPR); ++cycle) - GGML_F32x16_VEC_FMA((float32x16_t *)inVec1, (float32x16_t *)inVec2, &sum, np/GGML_F32_EPR); - - if (count != np) + // FIXME: replace this with a final round using masked vectors. + if ( n - np != 0 ) { - printf("handling remainder %u\n",count-np); // add the leftovers, that could not be handled by the vector loop. - // our extended last part of inVec1. + // our extended last part of x. float32x16_t v1 __attribute__((aligned(64))); GGML_F32x16_VEC_ZERO(&v1); - // our extended last part of inVec2. + // our extended last part of y. float32x16_t v2 __attribute__((aligned(64))); GGML_F32x16_VEC_ZERO(&v2); - memcpy(&v1, &inVec1[np], (count - np)*sizeof(float)); - memcpy(&v2, &inVec2[np], (count - np)*sizeof(float)); + memcpy(&v1, &x[np], (n - np)*sizeof(float)); + memcpy(&v2, &y[np], (n - np)*sizeof(float)); GGML_F32x16_VEC_FMA(&v1, - &v2, - &sum, 1); + &v2, + &sum, 1, 0); + } - // reduce sum0..sumX to sumf + // reduce sum, and store it in s. 
for (uint32_t i=0; i - -GGML_CALL float DotProduct_F32(const float * restrict vec1, const float * restrict vec2, uint32_t count); +void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); #ifdef __cplusplus } #endif - diff --git a/ggml.c b/ggml.c index 708d1698b..7d555c969 100644 --- a/ggml.c +++ b/ggml.c @@ -41,6 +41,7 @@ #pragma warning(disable: 4996) #endif +// hand assembled replacement functions are cool. #if defined(__k1om__) #include #endif @@ -452,7 +453,11 @@ int64_t ggml_cycles_per_ms(void) { static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); +#if defined(__k1om__) +// We get this function from elsewhere. +#else static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); +#endif static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc); static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { @@ -1334,6 +1339,9 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } +#if defined(__k1om__) +// we get this function from elsewhere. +#else static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); @@ -1366,8 +1374,6 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * for (int i = np; i < n; ++i) { sumf += x[i]*y[i]; } -#elif defined(__k1om__) - float sumf = DotProduct_F32(x, y, n); #else // scalar ggml_float sumf = 0.0; @@ -1378,6 +1384,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * *s = sumf; } +#endif static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) { assert(nrc == 1); From f882673ba662dbc44c7733450e70e00847b50e06 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sun, 17 Mar 2024 21:20:14 +0000 Subject: [PATCH 09/52] add a benchmark / test binary. --- bench-phi-knc.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 bench-phi-knc.c diff --git a/bench-phi-knc.c b/bench-phi-knc.c new file mode 100644 index 000000000..7f5431d87 --- /dev/null +++ b/bench-phi-knc.c @@ -0,0 +1,64 @@ +#include +#include +#include +#include +#include /*for CLOCK_REALTIME? */ + +void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); + +#include + + +#define MAXVEC 1024768 +#define RUNTOTAL 12 +#define RUNS +int main(void) +{ + struct timespec start, middle, end; + double vector_time; + double scalar_time; + float scalar = 0.0f; + float vector = 0.0f; + uint32_t vecRuns[] = {10, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 1024768}; + for (uint32_t runCount = 0; runCount < RUNTOTAL; ++runCount) + { + // Generate random input vector of [-1, 1] values. 
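The ggml.c wiring above is a plain link-time override: when building for __k1om__, the generic definition of ggml_vec_dot_f32 is preprocessed away and the linker picks up the hand-written one from ggml-phi-knc.o. A stripped-down sketch of that pattern (file and function names are hypothetical, not the actual llama.cpp build wiring):

    /* fast.c: hand-tuned version, compiled and linked only on the special target. */
    void my_vec_dot(int n, float *s, const float *x, const float *y);

    /* generic.c: */
    #if defined(__k1om__)
    /* my_vec_dot comes from fast.o; only the prototype is visible here. */
    void my_vec_dot(int n, float *s, const float *x, const float *y);
    #else
    void my_vec_dot(int n, float *s, const float *x, const float *y) {
        float sum = 0.0f;
        for (int i = 0; i < n; ++i) sum += x[i] * y[i];
        *s = sum;
    }
    #endif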
+ float vec1[MAXVEC] __attribute__((aligned(64))); + for (int i = 0; i < vecRuns[runCount]; i++) + vec1[i] = 2 * (0.5 - rand() / (float)RAND_MAX); + + // Generate a second random input vector of [-1, 1] values. + float vec2[MAXVEC] __attribute__((aligned(64))); + for (int i = 0; i < vecRuns[runCount]; i++) + vec2[i] = 2 * (0.5 - rand() / (float)RAND_MAX); + + // on your mark.. + clock_gettime(CLOCK_MONOTONIC, &start); + + // call dot product + ggml_vec_dot_f32(vecRuns[runCount], &vector, 0, vec1, 0, vec2, 0, 0); + + // save the middle point.. + clock_gettime(CLOCK_MONOTONIC, &middle); + + // do the same work by hand; + for (int i = 0; i < vecRuns[runCount]; ++i) + scalar += vec1[i]*vec2[i]; + + clock_gettime(CLOCK_MONOTONIC, &end); + + printf("vector\tvs\tscalar (%d items)\n", vector, scalar, vecRuns[runCount]); + printf("%.9f\tvs\t%.9f\n", vector, scalar); + + vector_time = middle.tv_sec - start.tv_sec; + vector_time += (middle.tv_nsec - start.tv_nsec) / 1000000000.0; + + scalar_time = end.tv_sec - middle.tv_sec; + scalar_time += (end.tv_nsec - middle.tv_nsec) / 1000000000.0; + + printf("%.9f\tvs\t%.9f\n", vector_time, scalar_time); + } + fflush(stdout); + + return 0; +} From ab6f3a8a8d38a0552d3e7494727c1bc5705bbe94 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sun, 17 Mar 2024 21:36:14 +0000 Subject: [PATCH 10/52] Update ggml-phi-knc.c --- ggml-phi-knc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index 648f81bcf..c15456781 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -4,6 +4,9 @@ #include +// For memcpy. +#include + // No, we have an SIMD unit. // #define GGML_SIMD From ee27148629ab9f522f17724a08134f7570353eea Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Wed, 20 Mar 2024 20:15:16 +0000 Subject: [PATCH 11/52] remove intrinsics import, and use upConv to save 12 bytes of memory transit. --- ggml-phi-knc.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index c15456781..d0b185899 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -1,5 +1,3 @@ -#include - #include #include @@ -24,11 +22,10 @@ void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restri inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target) { - // FIXME: how do we tell GNU AS to perform upconverts? Could remove two memory reads here... - float zero[4] __attribute__((aligned(64))) = {0.0f,0.0f,0.0f,0.0f}; + uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; __asm__ __volatile__ ( - "vbroadcastf32x4\t%[Z],\t%%zmm8\n\t" // use an upscaling operator to clear our value. + "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. "vmovaps\t\t%%zmm8,\t%[RES]\n\t" : [RES] "+m" (*target) : [Z] "m" (zero) @@ -38,8 +35,7 @@ inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target) // Multiply each item in mvec1 with the corresponding item in mvec2, adding the result to the corresponding item in sum. optionally clear the sum before starting. inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x16_t *mvec2, float32x16_t *sumvec, size_t iterations, int clear) { - // FIXME: how do we tell GNU AS to perform upconverts? Could remove two memory reads here... 
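The "upConv" in the next patch's subject refers to Knights Corner's load-time up-conversion: vbroadcastf32x4 with a {uint8} modifier reads just 4 bytes, converts each byte to float32, and broadcasts the resulting 4-float group across all 16 lanes, which is where the 12 bytes of saved memory transit come from (4 bytes of uint8 input instead of 16 bytes of float input). A scalar C model of what that single instruction does; this is illustrative only, not an intrinsic API:

    #include <stdint.h>

    /* Model of: vbroadcastf32x4 (mem){uint8} -> zmm. 4 bytes in, 16 floats out. */
    static void bcast_f32x4_from_u8(const uint8_t src[4], float dst[16]) {
        for (int group = 0; group < 4; ++group)   /* four copies... */
            for (int i = 0; i < 4; ++i)           /* ...of the up-converted quad */
                dst[group * 4 + i] = (float)src[i];
    }

With src = {0,0,0,0}, every destination lane becomes 0.0f, which is exactly how the zeroing routine uses it.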
- float zero[4] __attribute__((aligned(64))) = {0.0f,0.0f,0.0f,0.0f}; + uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; __asm__ __volatile__ ( "mov\t%[ITER],%%r8\n\t" // how many register sized chunks are we responsible for @@ -47,7 +43,7 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x "mov\t%[VEC2],%%r12\n\t" // where do we start work in mvec2? "cmp\t$1,%[CLR]\n\t" // should we clear the sum before we start? "jne\t4f\n\t" - "vbroadcastf32x4\t%[Z],\t%%zmm0\n\t" // if so, use an upscaling operator to do it. + "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm0\n\t" // if so, use an upscaling operator to do it. "vprefetchnta\t(%%r10)\n\t" "vprefetchnta\t(%%r12)\n\t" "vprefetch1\t128(%%r10)\n\t" From 76e66e77c2cc2015902a8195e352ea0981679b0e Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Wed, 20 Mar 2024 21:12:22 +0000 Subject: [PATCH 12/52] use the same header as ggml.c, and remove some warnings. --- bench-phi-knc.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/bench-phi-knc.c b/bench-phi-knc.c index 7f5431d87..ee96715fb 100644 --- a/bench-phi-knc.c +++ b/bench-phi-knc.c @@ -3,11 +3,9 @@ #include #include #include /*for CLOCK_REALTIME? */ - -void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); - #include +#include "ggml-phi-knc.h" #define MAXVEC 1024768 #define RUNTOTAL 12 @@ -19,7 +17,7 @@ int main(void) double scalar_time; float scalar = 0.0f; float vector = 0.0f; - uint32_t vecRuns[] = {10, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 1024768}; + int vecRuns[] = {10, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 1024768}; for (uint32_t runCount = 0; runCount < RUNTOTAL; ++runCount) { // Generate random input vector of [-1, 1] values. @@ -47,7 +45,7 @@ int main(void) clock_gettime(CLOCK_MONOTONIC, &end); - printf("vector\tvs\tscalar (%d items)\n", vector, scalar, vecRuns[runCount]); + printf("vector\tvs\tscalar (%d items)\n", vecRuns[runCount]); printf("%.9f\tvs\t%.9f\n", vector, scalar); vector_time = middle.tv_sec - start.tv_sec; From ac3637142d51c9625ab6c44ca9f6d0363ee45d57 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Wed, 20 Mar 2024 21:34:12 +0000 Subject: [PATCH 13/52] formatting changes. --- ggml-phi-knc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index d0b185899..8001b7a84 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -43,7 +43,7 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x "mov\t%[VEC2],%%r12\n\t" // where do we start work in mvec2? "cmp\t$1,%[CLR]\n\t" // should we clear the sum before we start? "jne\t4f\n\t" - "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm0\n\t" // if so, use an upscaling operator to do it. + "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm0\n\t" // if so, use an upscaling operator to do it. "vprefetchnta\t(%%r10)\n\t" "vprefetchnta\t(%%r12)\n\t" "vprefetch1\t128(%%r10)\n\t" @@ -97,7 +97,7 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x "vmovaps\t\t64(%%r12),\t%%zmm4\n\t" "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add "2:\n\t" // Label for loop end - "vmovaps\t\t%%zmm0,\t(%[RES])\n\t" // save our results. + "vmovnraps\t\t%%zmm0,\t(%[RES])\n\t" // save our results. 
: [RES] "+r" (sumvec) : [ITER] "r" (iterations), [VEC1] "r" (mvec1), From 0979522fbe9ea14bfeb3ff88d56f4ce34ec2f43e Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Thu, 21 Mar 2024 18:36:25 +0000 Subject: [PATCH 14/52] spacing changes. --- bench-phi-knc.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/bench-phi-knc.c b/bench-phi-knc.c index ee96715fb..5a5da5fe5 100644 --- a/bench-phi-knc.c +++ b/bench-phi-knc.c @@ -9,7 +9,7 @@ #define MAXVEC 1024768 #define RUNTOTAL 12 -#define RUNS +#define RUNS int main(void) { struct timespec start, middle, end; @@ -18,13 +18,14 @@ int main(void) float scalar = 0.0f; float vector = 0.0f; int vecRuns[] = {10, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 1024768}; + for (uint32_t runCount = 0; runCount < RUNTOTAL; ++runCount) { // Generate random input vector of [-1, 1] values. float vec1[MAXVEC] __attribute__((aligned(64))); for (int i = 0; i < vecRuns[runCount]; i++) vec1[i] = 2 * (0.5 - rand() / (float)RAND_MAX); - + // Generate a second random input vector of [-1, 1] values. float vec2[MAXVEC] __attribute__((aligned(64))); for (int i = 0; i < vecRuns[runCount]; i++) @@ -38,11 +39,11 @@ int main(void) // save the middle point.. clock_gettime(CLOCK_MONOTONIC, &middle); - + // do the same work by hand; for (int i = 0; i < vecRuns[runCount]; ++i) scalar += vec1[i]*vec2[i]; - + clock_gettime(CLOCK_MONOTONIC, &end); printf("vector\tvs\tscalar (%d items)\n", vecRuns[runCount]); @@ -55,8 +56,9 @@ int main(void) scalar_time += (end.tv_nsec - middle.tv_nsec) / 1000000000.0; printf("%.9f\tvs\t%.9f\n", vector_time, scalar_time); - } + } + fflush(stdout); - + return 0; } From 9185e149221f398be3d73aafdcc1dc8b3bd61b87 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Thu, 21 Mar 2024 20:38:49 +0000 Subject: [PATCH 15/52] be more specific about the length of our list of run amounts. --- bench-phi-knc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench-phi-knc.c b/bench-phi-knc.c index 5a5da5fe5..4b7f9d192 100644 --- a/bench-phi-knc.c +++ b/bench-phi-knc.c @@ -17,7 +17,7 @@ int main(void) double scalar_time; float scalar = 0.0f; float vector = 0.0f; - int vecRuns[] = {10, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 1024768}; + int vecRuns[RUNSTOTAL] = {10, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 1024768}; for (uint32_t runCount = 0; runCount < RUNTOTAL; ++runCount) { From a7bd64c130e455fb5e5377ea30593768744b445a Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 14:19:47 +0000 Subject: [PATCH 16/52] begin work on targeting dot_q5_K_q8_K. --- Makefile | 2 +- ggml-phi-knc-dot_q5_K_q8_K.c | 49 ++++++++++++++++++++++++++++++++++++ ggml-phi-knc-dot_q5_K_q8_K.h | 14 +++++++++++ 3 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 ggml-phi-knc-dot_q5_K_q8_K.c create mode 100644 ggml-phi-knc-dot_q5_K_q8_K.h diff --git a/Makefile b/Makefile index 3dbf3f2f0..42861f4b4 100644 --- a/Makefile +++ b/Makefile @@ -292,7 +292,7 @@ ifeq "${K1OM}" "" #MK_CFLAGS += -mssse3 #MK_CXXFLAGS += -mssse3 else - OBJS += ggml-phi-knc.o + OBJS += ggml-phi-knc.o ggml-phi-knc-dot_q5_K_q8_K.o MK_CFLAGS += -march=knc -mtune=knc endif diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c new file mode 100644 index 000000000..9104a939c --- /dev/null +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -0,0 +1,49 @@ + +/* A forward declaration, to keep GCC happy. 
*/ +void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); + +void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { + + const block_q5_K * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + int8_t aux8[QK_K]; + int16_t aux16[16]; + float sums [8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * restrict q4 = x[i].qs; + const uint8_t * restrict hm = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + int8_t * restrict a = aux8; + for (int l = 0; l < 32; ++l) { + a[l+ 0] = q4[l] & 0xF; + a[l+32] = q4[l] >> 4; + } + for (int is = 0; is < 8; ++is) { + uint8_t m = 1 << is; + for (int l = 0; l < 8; ++l) a[8*is + l] -= (hm[l] & m ? 0 : 16); + } + + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const int8_t * restrict sc = x[i].scales; + + for (int j = 0; j < QK_K/16; ++j) { + const float dl = d * sc[j]; + for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) sums[l] += dl * (aux16[l] + aux16[8+l]); + q8 += 16; a += 16; + } + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +} diff --git a/ggml-phi-knc-dot_q5_K_q8_K.h b/ggml-phi-knc-dot_q5_K_q8_K.h new file mode 100644 index 000000000..b416803e0 --- /dev/null +++ b/ggml-phi-knc-dot_q5_K_q8_K.h @@ -0,0 +1,14 @@ +#pragma once + +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* A forward declaration, to keep GCC happy. */ +void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); + +#ifdef __cplusplus +} +#endif From 9bcb8350d57897b516b60f8fed11a4087b0cbb9d Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 14:28:29 +0000 Subject: [PATCH 17/52] import stdint.h for sizeSt. --- ggml-phi-knc-dot_q5_K_q8_K.c | 2 ++ ggml-phi-knc.c | 1 + 2 files changed, 3 insertions(+) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 9104a939c..a3ff0143d 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -1,3 +1,5 @@ +// For size_t +#include /* A forward declaration, to keep GCC happy. */ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index 8001b7a84..e5e034bb8 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -1,3 +1,4 @@ +// For size_t #include #include From 8f57803f58aca40227206dad75c9211589c34aa0 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 14:29:59 +0000 Subject: [PATCH 18/52] import stdio.h for size_t. --- ggml-phi-knc-dot_q5_K_q8_K.c | 5 ++++- ggml-phi-knc.c | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index a3ff0143d..ec571a3fb 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -1,6 +1,9 @@ -// For size_t +// For uint32_t #include +// For size_t +#include + /* A forward declaration, to keep GCC happy. 
*/ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index e5e034bb8..341bbc01b 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -1,6 +1,6 @@ -// For size_t #include +// For size_t #include // For memcpy. From cd20404250038ad5a03f36a45c0e5886fd35186c Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 14:38:15 +0000 Subject: [PATCH 19/52] pull in ggml specific types. --- ggml-phi-knc-dot_q5_K_q8_K.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index ec571a3fb..bfff9112d 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -4,6 +4,9 @@ // For size_t #include +// For block_q5_K and block_q8_K +#include "ggml-common.h" + /* A forward declaration, to keep GCC happy. */ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); From 18f353987c4d55c2c578b13073e56a3eb9c7294c Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 14:49:35 +0000 Subject: [PATCH 20/52] tell ggml-common.h to export what we want. --- ggml-phi-knc-dot_q5_K_q8_K.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index bfff9112d..651ad1684 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -4,6 +4,8 @@ // For size_t #include +// Yes, we have to tell this header to actually export stuff. +#define GGML_COMMON_IMPL_C // For block_q5_K and block_q8_K #include "ggml-common.h" From 0b3f17127fe7261d1a8fadba9420d2ffb2d8e53f Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 14:58:33 +0000 Subject: [PATCH 21/52] force to compile. --- ggml-phi-knc-dot_q5_K_q8_K.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 651ad1684..67b9e6025 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -6,7 +6,13 @@ // Yes, we have to tell this header to actually export stuff. #define GGML_COMMON_IMPL_C -// For block_q5_K and block_q8_K +#include "ggml-common.h" +#include "ggml-quants.h" +#include "ggml-impl.h" + +// FIXME: why do we have to import this twice? +#define GGML_COMMON_IMPL_C +// For block_q5_K and block_q8_K. only given the second time. #include "ggml-common.h" /* A forward declaration, to keep GCC happy. */ From 0b012c03efe53524a07d41c56f2270078c30a6a7 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 15:02:56 +0000 Subject: [PATCH 22/52] allow using code from ggml-phi-knc-dot_q5_K_q8_K.c --- ggml-quants.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index 109dd6660..93e51bb11 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -4,6 +4,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" +// FIXME: why do we import this twice? #define GGML_COMMON_IMPL_C #include "ggml-common.h" @@ -49,6 +50,11 @@ #include #endif +// hand assembled replacement functions are cool. +#if defined(__k1om__) +#include +#endif + #undef MIN #undef MAX @@ -7094,6 +7100,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r } #endif +#if defined(__k1om__) +/* We get this from elsewhere. 
*/ +#else #if QK_K == 256 void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); @@ -7518,7 +7527,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r #endif } -#else +#else /* QK_K != 256 */ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); @@ -7787,8 +7796,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r *s = sumf; #endif } -#endif +#endif /* end QK_K != 256 */ +#endif /* defined(__k1om__) */ #if QK_K == 256 void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { From 0a2051aa88e8bff9109306df7e7bf57ccced63d2 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 15:55:00 +0000 Subject: [PATCH 23/52] attempt to speed up float clearing. --- ggml-phi-knc-dot_q5_K_q8_K.c | 91 +++++++++++++++++++++++------------- 1 file changed, 59 insertions(+), 32 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 67b9e6025..8e659ede8 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -15,51 +15,78 @@ // For block_q5_K and block_q8_K. only given the second time. #include "ggml-common.h" + +// This SIMD unit can work with 32 float32s at once. +#define GGML_F32_STEP 32 +// We can fit 16 of these float32s in a single vector register. +#define GGML_F32_EPR 16 + +typedef float float32x8_t __attribute__((vector_size (64))); + /* A forward declaration, to keep GCC happy. */ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); +inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) +{ + uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; + uint32_t mask=0x000000FF; + + __asm__ __volatile__ ( + "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. + "kmov\t%[M],\t%%k1\n\t" + "vmovaps\t\t%%zmm8,\t%[RES]%{%%k1%}\n\t" + : [RES] "+m" (*target) + : [Z] "m" (zero) + : [M] "r" (mask) + : "r9", "zmm8", "k1"); +} + void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { + /* interpret X and Y as vectors. */ const block_q5_K * restrict x = vx; const block_q8_K * restrict y = vy; - + + /* the number of blocks we will process this in. */ const int nb = n / QK_K; - + static const uint32_t kmask1 = 0x3f3f3f3f; static const uint32_t kmask2 = 0x0f0f0f0f; static const uint32_t kmask3 = 0x03030303; - + uint32_t utmp[4]; - int8_t aux8[QK_K]; - int16_t aux16[16]; - float sums [8]; - memset(sums, 0, 8*sizeof(float)); + int8_t aux8[QK_K]; + int16_t aux16[16]; + float32x8_t sums __attribute__((aligned(64))); - float sumf = 0; - for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - int8_t * restrict a = aux8; - for (int l = 0; l < 32; ++l) { - a[l+ 0] = q4[l] & 0xF; - a[l+32] = q4[l] >> 4; - } - for (int is = 0; is < 8; ++is) { - uint8_t m = 1 << is; - for (int l = 0; l < 8; ++l) a[8*is + l] -= (hm[l] & m ? 0 : 16); - } + /* use a vector operation to clear these floats. 
*/ + GGML_F32x8_VEC_ZERO(&sums); - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const int8_t * restrict sc = x[i].scales; - - for (int j = 0; j < QK_K/16; ++j) { - const float dl = d * sc[j]; - for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) sums[l] += dl * (aux16[l] + aux16[8+l]); - q8 += 16; a += 16; - } + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * restrict q4 = x[i].qs; + const uint8_t * restrict hm = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + int8_t * restrict a = aux8; + for (int l = 0; l < 32; ++l) { + a[l+ 0] = q4[l] & 0xF; + a[l+32] = q4[l] >> 4; } - for (int l = 0; l < 8; ++l) sumf += sums[l]; - *s = sumf; + for (int is = 0; is < 8; ++is) { + uint8_t m = 1 << is; + for (int l = 0; l < 8; ++l) a[8*is + l] -= (hm[l] & m ? 0 : 16); + } + + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const int8_t * restrict sc = x[i].scales; + + for (int j = 0; j < QK_K/16; ++j) { + const float dl = d * sc[j]; + for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) ((float *)sums)[l] += dl * (aux16[l] + aux16[8+l]); + q8 += 16; a += 16; + } + } + for (int l = 0; l < 8; ++l) sumf += ((float *)sums)[l]; + *s = sumf; } From 6face8a0bebe3e0e24ff2ca8a2b6feae3a2c885d Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 15:56:47 +0000 Subject: [PATCH 24/52] first fixes. --- ggml-phi-knc-dot_q5_K_q8_K.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 8e659ede8..e9ee43844 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -36,8 +36,8 @@ inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) "kmov\t%[M],\t%%k1\n\t" "vmovaps\t\t%%zmm8,\t%[RES]%{%%k1%}\n\t" : [RES] "+m" (*target) - : [Z] "m" (zero) - : [M] "r" (mask) + : [Z] "m" (zero), + [M] "r" (mask) : "r9", "zmm8", "k1"); } @@ -83,10 +83,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int j = 0; j < QK_K/16; ++j) { const float dl = d * sc[j]; for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) ((float *)sums)[l] += dl * (aux16[l] + aux16[8+l]); + for (int l = 0; l < 8; ++l) ((float *)&sums)[l] += dl * (aux16[l] + aux16[8+l]); q8 += 16; a += 16; } } - for (int l = 0; l < 8; ++l) sumf += ((float *)sums)[l]; + for (int l = 0; l < 8; ++l) sumf += ((float *)&sums)[l]; *s = sumf; } From edb76ffddbad7b6269fb0e078a05668bd2deb970 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 16:19:17 +0000 Subject: [PATCH 25/52] formatting improvement. --- ggml-phi-knc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml-phi-knc.h b/ggml-phi-knc.h index a4b59ae04..d2fd11428 100644 --- a/ggml-phi-knc.h +++ b/ggml-phi-knc.h @@ -6,7 +6,8 @@ extern "C" { #endif -void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); + /* A forward declaration, to keep GCC happy. */ + void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); #ifdef __cplusplus } From e3503c924adf2739fa8ebe3ef7f8454427ed8fed Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 16:21:20 +0000 Subject: [PATCH 26/52] promote aux16 into a vector. 
--- ggml-phi-knc-dot_q5_K_q8_K.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index e9ee43844..251591214 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -22,6 +22,7 @@ #define GGML_F32_EPR 16 typedef float float32x8_t __attribute__((vector_size (64))); +typedef int16 int16x16_t __attribute__((vector_size (64))); /* A forward declaration, to keep GCC happy. */ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); @@ -56,15 +57,19 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r uint32_t utmp[4]; int8_t aux8[QK_K]; - int16_t aux16[16]; + // int16_t aux16[16]; + int16x16_t aux16; float32x8_t sums __attribute__((aligned(64))); /* use a vector operation to clear these floats. */ GGML_F32x8_VEC_ZERO(&sums); float sumf = 0; + for (int i = 0; i < nb; ++i) { + // quants, 4 low bits. const uint8_t * restrict q4 = x[i].qs; + // quants, 1 high bit. const uint8_t * restrict hm = x[i].qh; const int8_t * restrict q8 = y[i].qs; int8_t * restrict a = aux8; @@ -82,8 +87,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int j = 0; j < QK_K/16; ++j) { const float dl = d * sc[j]; - for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) ((float *)&sums)[l] += dl * (aux16[l] + aux16[8+l]); + for (int l = 0; l < 16; ++l) ((int16 *)&aux16)[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) ((float *)&sums)[l] += dl * (((int16 *)&aux16)[l] + ((int16 *)&aux16)[8+l]); q8 += 16; a += 16; } } From c72157a5a6fe0cfd3721bc5ff9f111c0ae6bbc50 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 16:24:11 +0000 Subject: [PATCH 27/52] promote aux16 into a vector. --- ggml-phi-knc-dot_q5_K_q8_K.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 251591214..7e149f34d 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -58,7 +58,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r uint32_t utmp[4]; int8_t aux8[QK_K]; // int16_t aux16[16]; - int16x16_t aux16; + int16x16_t aux16 __attribute__((aligned(64))); float32x8_t sums __attribute__((aligned(64))); /* use a vector operation to clear these floats. */ @@ -87,8 +87,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int j = 0; j < QK_K/16; ++j) { const float dl = d * sc[j]; - for (int l = 0; l < 16; ++l) ((int16 *)&aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) ((float *)&sums)[l] += dl * (((int16 *)&aux16)[l] + ((int16 *)&aux16)[8+l]); + for (int l = 0; l < 16; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) ((float *)&sums)[l] += dl * (((int16_t *)&aux16)[l] + ((int16_t *)&aux16)[8+l]); q8 += 16; a += 16; } } From f092a10dc9261d67e9a0c483595f3ad9356d64c7 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 16:27:11 +0000 Subject: [PATCH 28/52] promote aux16 into a vector. 
(part three) --- ggml-phi-knc-dot_q5_K_q8_K.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 7e149f34d..acb965c95 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -22,7 +22,7 @@ #define GGML_F32_EPR 16 typedef float float32x8_t __attribute__((vector_size (64))); -typedef int16 int16x16_t __attribute__((vector_size (64))); +typedef int16_t int16x16_t __attribute__((vector_size (64))); /* A forward declaration, to keep GCC happy. */ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); From e43a63e7c622071a9dc481491f6a586263b922fa Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 16:29:30 +0000 Subject: [PATCH 29/52] fix typo. --- bench-phi-knc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench-phi-knc.c b/bench-phi-knc.c index 4b7f9d192..a59e2e5b7 100644 --- a/bench-phi-knc.c +++ b/bench-phi-knc.c @@ -17,7 +17,7 @@ int main(void) double scalar_time; float scalar = 0.0f; float vector = 0.0f; - int vecRuns[RUNSTOTAL] = {10, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 1024768}; + int vecRuns[RUNTOTAL] = {10, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 1024768}; for (uint32_t runCount = 0; runCount < RUNTOTAL; ++runCount) { From 31d4f9312be9fd551606c65c1ebf8e05d0863a17 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 19:47:21 +0000 Subject: [PATCH 30/52] copy right block. --- ggml-phi-knc-dot_q5_K_q8_K.c | 76 +++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 28 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index acb965c95..cab3b9dc2 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -30,7 +30,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) { uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; - uint32_t mask=0x000000FF; + uint32_t mask=0x0000FF00; __asm__ __volatile__ ( "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. @@ -55,43 +55,63 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r static const uint32_t kmask2 = 0x0f0f0f0f; static const uint32_t kmask3 = 0x03030303; - uint32_t utmp[4]; - int8_t aux8[QK_K]; - // int16_t aux16[16]; - int16x16_t aux16 __attribute__((aligned(64))); - float32x8_t sums __attribute__((aligned(64))); + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; - /* use a vector operation to clear these floats. */ - GGML_F32x8_VEC_ZERO(&sums); + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); float sumf = 0; - for (int i = 0; i < nb; ++i) { - // quants, 4 low bits. const uint8_t * restrict q4 = x[i].qs; - // quants, 1 high bit. const uint8_t * restrict hm = x[i].qh; const int8_t * restrict q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); int8_t * restrict a = aux8; - for (int l = 0; l < 32; ++l) { - a[l+ 0] = q4[l] & 0xF; - a[l+32] = q4[l] >> 4; + uint8_t m = 1; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 
16 : 0); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); + a += 32; m <<= 1; + q4 += 32; } - for (int is = 0; is < 8; ++is) { - uint8_t m = 1 << is; - for (int l = 0; l < 8; ++l) a[8*is + l] -= (hm[l] & m ? 0 : 16); - } - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const int8_t * restrict sc = x[i].scales; - - for (int j = 0; j < QK_K/16; ++j) { - const float dl = d * sc[j]; - for (int l = 0; l < 16; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) ((float *)&sums)[l] += dl * (((int16_t *)&aux16)[l] + ((int16_t *)&aux16)[8+l]); - q8 += 16; a += 16; + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; } + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; } - for (int l = 0; l < 8; ++l) sumf += ((float *)&sums)[l]; + for (int l = 0; l < 8; ++l) sumf += sums[l]; *s = sumf; } From f985372e3aa13a4fd4f7b0655281cc09a0f3b446 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 19:49:16 +0000 Subject: [PATCH 31/52] add missing variable. --- ggml-phi-knc-dot_q5_K_q8_K.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index cab3b9dc2..668bae93b 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -55,6 +55,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r static const uint32_t kmask2 = 0x0f0f0f0f; static const uint32_t kmask3 = 0x03030303; + uint32_t utmp[4]; + const uint8_t * scales = (const uint8_t*)&utmp[0]; const uint8_t * mins = (const uint8_t*)&utmp[2]; From bd6d7e6238d1c6647682fb3dc1d8c4f6abe59457 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 19:55:12 +0000 Subject: [PATCH 32/52] try to use vectorized zeroing function. 
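The utmp/kmask shuffle in the corrected block above does one job: it extracts eight 6-bit (scale, min) pairs from the 12 packed bytes of x[i].scales. The same extraction written longhand, equivalent to the kmask1/kmask2/kmask3 version assuming the upstream q*_K scale packing:

    #include <stdint.h>

    /* For j < 4 the 6-bit fields sit whole in bytes 0..7; for j >= 4 the low
       4 bits live in bytes 8..11 and the top 2 bits are borrowed from the
       upper bits of bytes j-4 (scale) and j (min). */
    static void get_scale_min(int j, const uint8_t scales[12],
                              uint8_t *sc, uint8_t *min) {
        if (j < 4) {
            *sc  = scales[j] & 63;
            *min = scales[j + 4] & 63;
        } else {
            *sc  = (scales[j + 4] & 0xF) | ((scales[j - 4] >> 6) << 4);
            *min = (scales[j + 4] >>  4) | ((scales[j]     >> 6) << 4);
        }
    }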
--- ggml-phi-knc-dot_q5_K_q8_K.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 668bae93b..68c1aa965 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -30,7 +30,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) { uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; - uint32_t mask=0x0000FF00; + uint32_t mask=0x0000000F; __asm__ __volatile__ ( "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. @@ -62,9 +62,12 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r int8_t aux8[QK_K]; int16_t aux16[8]; - float sums [8]; + float32x8_t sums; int32_t aux32[8]; - memset(sums, 0, 8*sizeof(float)); + + //memset(sums, 0, 8*sizeof(float)); + + GGML_F32x8_VEC_ZERO(&sums); float sumf = 0; for (int i = 0; i < nb; ++i) { @@ -110,10 +113,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r q8 += 8; a += 8; } const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + for (int l = 0; l < 8; ++l) ((float *)&sums)[l] += d * aux32[l]; const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } - for (int l = 0; l < 8; ++l) sumf += sums[l]; + for (int l = 0; l < 8; ++l) sumf += ((float *)&sums)[l]; *s = sumf; } From 9d7ca41703892293073d539bcaa268e888462aca Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 20:48:43 +0000 Subject: [PATCH 33/52] expand mask, and align memory. --- ggml-phi-knc-dot_q5_K_q8_K.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 68c1aa965..a9a9c0ae9 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -30,7 +30,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) { uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; - uint32_t mask=0x0000000F; + uint32_t mask=0x000000FF; __asm__ __volatile__ ( "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. @@ -39,7 +39,7 @@ inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) : [RES] "+m" (*target) : [Z] "m" (zero), [M] "r" (mask) - : "r9", "zmm8", "k1"); + : "zmm8", "k1", memory); } void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { @@ -62,11 +62,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r int8_t aux8[QK_K]; int16_t aux16[8]; - float32x8_t sums; + float32x8_t sums __attribute__((aligned(64))); int32_t aux32[8]; - //memset(sums, 0, 8*sizeof(float)); - GGML_F32x8_VEC_ZERO(&sums); float sumf = 0; From bb5eb95816d38aab2cb70bb11b3026b4f5181d8e Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 20:49:11 +0000 Subject: [PATCH 34/52] use better memory save operator. --- ggml-phi-knc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index 341bbc01b..e767e2306 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -6,9 +6,6 @@ // For memcpy. #include -// No, we have an SIMD unit. -// #define GGML_SIMD - // This SIMD unit can work with 32 float32s at once. 
#define GGML_F32_STEP 32 // We can fit 16 of these float32s in a single vector register. @@ -27,7 +24,7 @@ inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target) __asm__ __volatile__ ( "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. - "vmovaps\t\t%%zmm8,\t%[RES]\n\t" + "vmovnraps\t\t%%zmm8,\t%[RES]\n\t" : [RES] "+m" (*target) : [Z] "m" (zero) : "zmm8"); From f09b3ed79ebd6d9bf767976f57e9b1caac32b273 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 20:53:16 +0000 Subject: [PATCH 35/52] use quotes properly. --- ggml-phi-knc-dot_q5_K_q8_K.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index a9a9c0ae9..b4049e9b5 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -39,7 +39,7 @@ inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) : [RES] "+m" (*target) : [Z] "m" (zero), [M] "r" (mask) - : "zmm8", "k1", memory); + : "zmm8", "k1", "memory"); } void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { From 2fdd11fe3a65f043a54e6950257512c34162f6eb Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 21:00:51 +0000 Subject: [PATCH 36/52] promote aux16 to a vector. --- ggml-phi-knc-dot_q5_K_q8_K.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index b4049e9b5..1443398ff 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -22,7 +22,7 @@ #define GGML_F32_EPR 16 typedef float float32x8_t __attribute__((vector_size (64))); -typedef int16_t int16x16_t __attribute__((vector_size (64))); +typedef int16_t int16x8_t __attribute__((vector_size (32))); /* A forward declaration, to keep GCC happy. 
*/ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); @@ -61,7 +61,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * mins = (const uint8_t*)&utmp[2]; int8_t aux8[QK_K]; - int16_t aux16[8]; + int16x8_t aux16 __attribute__((aligned(64))); float32x8_t sums __attribute__((aligned(64))); int32_t aux32[8]; @@ -97,17 +97,17 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r int is = 0; for (int j = 0; j < QK_K/32; ++j) { int32_t scale = scales[is++]; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + for (int l = 0; l < 8; ++l) ((int16_t *)aux16)[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)aux16)[l]; q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + for (int l = 0; l < 8; ++l) ((int16_t *)aux16)[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)aux16)[l]; q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + for (int l = 0; l < 8; ++l) ((int16_t *)aux16)[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)aux16)[l]; q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + for (int l = 0; l < 8; ++l) ((int16_t *)aux16)[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)aux16)[l]; q8 += 8; a += 8; } const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; From f967690a415ee72efb7d6ea7e7292d084bc5d278 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 21:05:50 +0000 Subject: [PATCH 37/52] add missing address of operators. 
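aux16 is a GNU vector type now, so element accesses have to cast the vector's
address, not its value. A minimal illustration of the bug being fixed:

    int16x8_t v;
    ((int16_t *)&v)[0] = 1; /* ok: index through the vector's address */
    ((int16_t *)v)[0] = 1;  /* broken: tries to convert the vector value
                               itself to a pointer; GCC rejects it */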
--- ggml-phi-knc-dot_q5_K_q8_K.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 1443398ff..c10852c57 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -97,17 +97,17 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r int is = 0; for (int j = 0; j < QK_K/32; ++j) { int32_t scale = scales[is++]; - for (int l = 0; l < 8; ++l) ((int16_t *)aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)aux16)[l]; + for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)&aux16)[l]; q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) ((int16_t *)aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)aux16)[l]; + for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)&aux16)[l]; q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) ((int16_t *)aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)aux16)[l]; + for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)&aux16)[l]; q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) ((int16_t *)aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)aux16)[l]; + for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)&aux16)[l]; q8 += 8; a += 8; } const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; From ea1edb0600c746b60bce05cccc0567d832c27725 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 21:12:35 +0000 Subject: [PATCH 38/52] promote aux32 to a vector. --- ggml-phi-knc-dot_q5_K_q8_K.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index c10852c57..dac4b3257 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -23,6 +23,7 @@ typedef float float32x8_t __attribute__((vector_size (64))); typedef int16_t int16x8_t __attribute__((vector_size (32))); +typedef int32_t int32x8_t __attribute__((vector_size (64))); /* A forward declaration, to keep GCC happy. 
*/ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); @@ -63,7 +64,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r int8_t aux8[QK_K]; int16x8_t aux16 __attribute__((aligned(64))); float32x8_t sums __attribute__((aligned(64))); - int32_t aux32[8]; + int32x8_t aux32 __attribute__((aligned(64))); GGML_F32x8_VEC_ZERO(&sums); @@ -98,20 +99,20 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int j = 0; j < QK_K/32; ++j) { int32_t scale = scales[is++]; for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)&aux16)[l]; + for (int l = 0; l < 8; ++l) ((int32_t *)&aux32)[l] += scale * ((int16_t *)&aux16)[l]; q8 += 8; a += 8; for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)&aux16)[l]; + for (int l = 0; l < 8; ++l) ((int32_t *)&aux32)[l] += scale * ((int16_t *)&aux16)[l]; q8 += 8; a += 8; for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)&aux16)[l]; + for (int l = 0; l < 8; ++l) ((int32_t *)&aux32)[l] += scale * ((int16_t *)&aux16)[l]; q8 += 8; a += 8; for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)&aux16)[l]; + for (int l = 0; l < 8; ++l) ((int32_t *)&aux32)[l] += scale * ((int16_t *)&aux16)[l]; q8 += 8; a += 8; } const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - for (int l = 0; l < 8; ++l) ((float *)&sums)[l] += d * aux32[l]; + for (int l = 0; l < 8; ++l) ((float *)&sums)[l] += d * ((int32_t *)&aux32)[l]; const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } From 4477b8e123c960936e9ec31f20d1f1644ca8b176 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 21:16:23 +0000 Subject: [PATCH 39/52] add I32 vector memory clearing. --- ggml-phi-knc-dot_q5_K_q8_K.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index dac4b3257..b2a7f3106 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -43,6 +43,21 @@ inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) : "zmm8", "k1", "memory"); } +inline static void GGML_I32x8_VEC_ZERO(int32x8_t *target) +{ + uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; + uint32_t mask=0x000000FF; + + __asm__ __volatile__ ( + "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. + "kmov\t%[M],\t%%k1\n\t" + "vmovaps\t\t%%zmm8,\t%[RES]%{%%k1%}\n\t" + : [RES] "+m" (*target) + : [Z] "m" (zero), + [M] "r" (mask) + : "zmm8", "k1", "memory"); +} + void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { /* interpret X and Y as vectors. 
*/ @@ -73,7 +88,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * restrict q4 = x[i].qs; const uint8_t * restrict hm = x[i].qh; const int8_t * restrict q8 = y[i].qs; - memset(aux32, 0, 8*sizeof(int32_t)); + GGML_I32x8_VEC_ZERO(&aux32); + int8_t * restrict a = aux8; uint8_t m = 1; for (int j = 0; j < QK_K/64; ++j) { From a5132a15071280529271f12c9f51aab3d69ae650 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 22:16:57 +0000 Subject: [PATCH 40/52] attempt our first FMA. --- ggml-phi-knc-dot_q5_K_q8_K.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index b2a7f3106..adbb55b4b 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -58,6 +58,31 @@ inline static void GGML_I32x8_VEC_ZERO(int32x8_t *target) : "zmm8", "k1", "memory"); } +// perform an eight wide Fused Multiply Add of an I16x8 times scalar S into I32x8. +inline static void GGML_I16x8_S_FMA_I32x8 (int16x8_t *src, int32_t scale, int32x8_t *dest) +{ + uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; + uint32_t mask=0x000000FF; + int32_t scaleVec[4] = {scale, scale, scale, scale}; + + __asm__ __volatile__ ( + "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm0\n\t" // use an upscaling operator to clear our value. + "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm1\n\t" // use an upscaling operator to clear our value. + "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm2\n\t" // use an upscaling operator to clear our value. + "kmov\t%[M],\t%%k1\n\t" // we will only be working with 8 values at a time. le sigh. + "vmovaps\t\t%[SRC]%{int16%},\t%%zmm0%{%%k1%}\n\t" // load the item we will be summing from. upscale it from int16. + "vbroadcastI32x4\t%[SCALE],\t%%zmm1\n\t" // load the item we will be multiplying by. + "vmovaps\t\t%[RES],\t%%zmm2%{%%k1%}\n\t" // load the item we will be summing onto. + "vpmadd231d\t%%zmm0,\t%%zmm1,\t%%zmm2%{%%k1%}\n\t" // perform our multiply-add. + "vmovaps\t\t%%zmm2,\t%[RES]%{%%k1}\n\t" // save the result. + : [RES] "+m" (*target) + : [Z] "m" (zero), + [M] "r" (mask), + [SRC] "m" (src), + [SCALE] "m" (scaleVec) + : "zmm0", "zmm1", "zmm2", "k1", "memory"); +} + void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { /* interpret X and Y as vectors. */ @@ -124,7 +149,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int l = 0; l < 8; ++l) ((int32_t *)&aux32)[l] += scale * ((int16_t *)&aux16)[l]; q8 += 8; a += 8; for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) ((int32_t *)&aux32)[l] += scale * ((int16_t *)&aux16)[l]; + GGML_I16x8_S_FMA_I32x8 (aux16, scale, aux32); + // for (int l = 0; l < 8; ++l) ((int32_t *)&aux32)[l] += scale * ((int16_t *)&aux16)[l]; q8 += 8; a += 8; } const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; From 5935bb34f49ec2e91f55c4f6cd037b4b70a34a49 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 23:46:36 +0000 Subject: [PATCH 41/52] use proper mov operator, and pass addresses. 
--- ggml-phi-knc-dot_q5_K_q8_K.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index adbb55b4b..ab64198c0 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -70,12 +70,12 @@ inline static void GGML_I16x8_S_FMA_I32x8 (int16x8_t *src, int32_t scale, int32x "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm1\n\t" // use an upscaling operator to clear our value. "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm2\n\t" // use an upscaling operator to clear our value. "kmov\t%[M],\t%%k1\n\t" // we will only be working with 8 values at a time. le sigh. - "vmovaps\t\t%[SRC]%{int16%},\t%%zmm0%{%%k1%}\n\t" // load the item we will be summing from. upscale it from int16. + "vmovdqa32\t\t%[SRC]%{sint16%},\t%%zmm0%{%%k1%}\n\t" // load the item we will be summing from. upscale it from int16. "vbroadcastI32x4\t%[SCALE],\t%%zmm1\n\t" // load the item we will be multiplying by. - "vmovaps\t\t%[RES],\t%%zmm2%{%%k1%}\n\t" // load the item we will be summing onto. + "vmovdqa32\t\t%[RES],\t%%zmm2%{%%k1%}\n\t" // load the item we will be summing onto. "vpmadd231d\t%%zmm0,\t%%zmm1,\t%%zmm2%{%%k1%}\n\t" // perform our multiply-add. - "vmovaps\t\t%%zmm2,\t%[RES]%{%%k1}\n\t" // save the result. - : [RES] "+m" (*target) + "vmovdqa32\t\t%%zmm2,\t%[RES]%{%%k1}\n\t" // save the result. + : [RES] "+m" (*dest) : [Z] "m" (zero), [M] "r" (mask), [SRC] "m" (src), @@ -149,8 +149,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int l = 0; l < 8; ++l) ((int32_t *)&aux32)[l] += scale * ((int16_t *)&aux16)[l]; q8 += 8; a += 8; for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - GGML_I16x8_S_FMA_I32x8 (aux16, scale, aux32); - // for (int l = 0; l < 8; ++l) ((int32_t *)&aux32)[l] += scale * ((int16_t *)&aux16)[l]; + GGML_I16x8_S_FMA_I32x8 (&aux16, scale, &aux32); q8 += 8; a += 8; } const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; From 03a3e0eb7aab095d02bde39af7ed9217d318ce9d Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sun, 24 Mar 2024 12:04:44 +0000 Subject: [PATCH 42/52] perform 16 operations at a time. --- ggml-phi-knc-dot_q5_K_q8_K.c | 77 ++++++++++++++++++++++++------------ 1 file changed, 51 insertions(+), 26 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index ab64198c0..37f7cb8fa 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -24,6 +24,8 @@ typedef float float32x8_t __attribute__((vector_size (64))); typedef int16_t int16x8_t __attribute__((vector_size (32))); typedef int32_t int32x8_t __attribute__((vector_size (64))); +typedef int16_t int16x16_t __attribute__((vector_size (64))); +typedef int32_t int32x16_t __attribute__((vector_size (128))); /* A forward declaration, to keep GCC happy. */ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); @@ -58,6 +60,19 @@ inline static void GGML_I32x8_VEC_ZERO(int32x8_t *target) : "zmm8", "k1", "memory"); } +inline static void GGML_I32x16_VEC_ZERO(int32x8_t *target) +{ + uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; + + __asm__ __volatile__ ( + "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. 
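+ // (the %{uint8%} up-convert form reads four bytes from [Z], widening each
+ // to 32 bits as it broadcasts, so a 4-byte zero buffer is enough to fill
+ // the whole 64-byte register.)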
+ "kmov\t%[M],\t%%k1\n\t" + "vmovaps\t\t%%zmm8,\t%[RES]%{%%k1%}\n\t" + : [RES] "+m" (*target) + : [Z] "m" (zero) + : "zmm8", "k1", "memory"); +} + // perform an eight wide Fused Multiply Add of an I16x8 times scalar S into I32x8. inline static void GGML_I16x8_S_FMA_I32x8 (int16x8_t *src, int32_t scale, int32x8_t *dest) { @@ -66,15 +81,12 @@ inline static void GGML_I16x8_S_FMA_I32x8 (int16x8_t *src, int32_t scale, int32x int32_t scaleVec[4] = {scale, scale, scale, scale}; __asm__ __volatile__ ( - "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm0\n\t" // use an upscaling operator to clear our value. - "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm1\n\t" // use an upscaling operator to clear our value. - "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm2\n\t" // use an upscaling operator to clear our value. "kmov\t%[M],\t%%k1\n\t" // we will only be working with 8 values at a time. le sigh. - "vmovdqa32\t\t%[SRC]%{sint16%},\t%%zmm0%{%%k1%}\n\t" // load the item we will be summing from. upscale it from int16. + "vmovdqa32\t\t%[SRC]%{sint16%},\t%%zmm0%{%%k1%}\n\t" // load the item we will be summing from. upscale it from int16. "vbroadcastI32x4\t%[SCALE],\t%%zmm1\n\t" // load the item we will be multiplying by. - "vmovdqa32\t\t%[RES],\t%%zmm2%{%%k1%}\n\t" // load the item we will be summing onto. + "vmovdqa32\t\t%[RES],\t%%zmm2%{%%k1%}\n\t" // load the item we will be summing onto. "vpmadd231d\t%%zmm0,\t%%zmm1,\t%%zmm2%{%%k1%}\n\t" // perform our multiply-add. - "vmovdqa32\t\t%%zmm2,\t%[RES]%{%%k1}\n\t" // save the result. + "vmovdqa32\t\t%%zmm2,\t%[RES]%{%%k1}\n\t" // save the result. : [RES] "+m" (*dest) : [Z] "m" (zero), [M] "r" (mask), @@ -83,6 +95,23 @@ inline static void GGML_I16x8_S_FMA_I32x8 (int16x8_t *src, int32_t scale, int32x : "zmm0", "zmm1", "zmm2", "k1", "memory"); } +// perform an eight wide Fused Multiply Add of an I16x16 times scalar S into I32x16. +inline static void GGML_I16x16_S_FMA_I32x16 (int16x8_t *src, int32_t scale, int32x8_t *dest) +{ + int32_t scaleVec[4] = {scale, scale, scale, scale}; + + __asm__ __volatile__ ( + "vmovdqa32\t\t%[SRC]%{sint16%},\t%%zmm0\n\t" // load the item we will be summing from. upscale it from int16. + "vbroadcastI32x4\t%[SCALE],\t%%zmm1\n\t" // load the item we will be multiplying by. + "vmovdqa32\t\t%[RES],\t%%zmm2\n\t" // load the item we will be summing onto. + "vpmadd231d\t%%zmm0,\t%%zmm1,\t%%zmm2\n\t" // perform our multiply-add. + "vmovdqa32\t\t%%zmm2,\t%[RES]\n\t" // save the result. + : [RES] "+m" (*dest) + : [SRC] "m" (src), + [SCALE] "m" (scaleVec) + : "zmm0", "zmm1", "zmm2", "k1", "memory"); +} + void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { /* interpret X and Y as vectors. 
*/ @@ -101,19 +130,20 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * scales = (const uint8_t*)&utmp[0]; const uint8_t * mins = (const uint8_t*)&utmp[2]; - int8_t aux8[QK_K]; - int16x8_t aux16 __attribute__((aligned(64))); - float32x8_t sums __attribute__((aligned(64))); - int32x8_t aux32 __attribute__((aligned(64))); + int8_t aux8[QK_K]; + int16x16_t aux16 __attribute__((aligned(128))); + float32x16_t sums __attribute__((aligned(64))); + int32x16_t aux32 __attribute__((aligned(128))); - GGML_F32x8_VEC_ZERO(&sums); + GGML_F32x16_VEC_ZERO(&sums); float sumf = 0; for (int i = 0; i < nb; ++i) { const uint8_t * restrict q4 = x[i].qs; const uint8_t * restrict hm = x[i].qh; const int8_t * restrict q8 = y[i].qs; - GGML_I32x8_VEC_ZERO(&aux32); + + GGML_I32x16_VEC_ZERO(&aux32); int8_t * restrict a = aux8; uint8_t m = 1; @@ -139,24 +169,19 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r int is = 0; for (int j = 0; j < QK_K/32; ++j) { int32_t scale = scales[is++]; - for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) ((int32_t *)&aux32)[l] += scale * ((int16_t *)&aux16)[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) ((int32_t *)&aux32)[l] += scale * ((int16_t *)&aux16)[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) ((int32_t *)&aux32)[l] += scale * ((int16_t *)&aux16)[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - GGML_I16x8_S_FMA_I32x8 (&aux16, scale, &aux32); - q8 += 8; a += 8; + for (int l = 0; l < 16; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; + GGML_I16x8_S_FMA_I32x16 (&aux16, scale, &aux32); + q8 += 16; a += 16; + /* FIXME: while comparing FMA output to normal output, the original had an error. hunt it down. */ + for (int l = 0; l < 16; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; + GGML_I16x8_S_FMA_I32x16 (&aux16, scale, &aux32); + q8 += 16; a += 16; } const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - for (int l = 0; l < 8; ++l) ((float *)&sums)[l] += d * ((int32_t *)&aux32)[l]; + for (int l = 0; l < 16; ++l) ((float *)&sums)[l] += d * ((int32_t *)&aux32)[l]; const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } - for (int l = 0; l < 8; ++l) sumf += ((float *)&sums)[l]; + for (int l = 0; l < 16; ++l) sumf += ((float *)&sums)[l]; *s = sumf; } From ba4f4129b362fd16c336ca74d4a3ef4aaffe27a9 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sun, 24 Mar 2024 12:17:06 +0000 Subject: [PATCH 43/52] better comments, and fix some small errors. --- ggml-phi-knc-dot_q5_K_q8_K.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 37f7cb8fa..66f1f1622 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -30,6 +30,7 @@ typedef int32_t int32x16_t __attribute__((vector_size (128))); /* A forward declaration, to keep GCC happy. */ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); +/* clear a vector of 8 floats. 
*/ inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) { uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; @@ -45,6 +46,7 @@ inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) : "zmm8", "k1", "memory"); } +/* clear a vector of 8 int32_ts. */ inline static void GGML_I32x8_VEC_ZERO(int32x8_t *target) { uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; @@ -60,7 +62,8 @@ inline static void GGML_I32x8_VEC_ZERO(int32x8_t *target) : "zmm8", "k1", "memory"); } -inline static void GGML_I32x16_VEC_ZERO(int32x8_t *target) +/* clear a vector of 16 int32_ts. */ +inline static void GGML_I32x16_VEC_ZERO(int32x16_t *target) { uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; @@ -73,7 +76,7 @@ inline static void GGML_I32x16_VEC_ZERO(int32x8_t *target) : "zmm8", "k1", "memory"); } -// perform an eight wide Fused Multiply Add of an I16x8 times scalar S into I32x8. +// perform a Fused Multiply Add of an I16x8 times scalar S into I32x8. inline static void GGML_I16x8_S_FMA_I32x8 (int16x8_t *src, int32_t scale, int32x8_t *dest) { uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; @@ -95,8 +98,8 @@ inline static void GGML_I16x8_S_FMA_I32x8 (int16x8_t *src, int32_t scale, int32x : "zmm0", "zmm1", "zmm2", "k1", "memory"); } -// perform an eight wide Fused Multiply Add of an I16x16 times scalar S into I32x16. -inline static void GGML_I16x16_S_FMA_I32x16 (int16x8_t *src, int32_t scale, int32x8_t *dest) +// perform a Fused Multiply Add of an I16x16 times scalar S into I32x16. +inline static void GGML_I16x16_S_FMA_I32x16 (int16x16_t *src, int32_t scale, int32x16_t *dest) { int32_t scaleVec[4] = {scale, scale, scale, scale}; @@ -131,8 +134,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * mins = (const uint8_t*)&utmp[2]; int8_t aux8[QK_K]; - int16x16_t aux16 __attribute__((aligned(128))); - float32x16_t sums __attribute__((aligned(64))); + float32x16_t sums __attribute__((aligned(128))); + int16x16_t aux16 __attribute__((aligned(64))); int32x16_t aux32 __attribute__((aligned(128))); GGML_F32x16_VEC_ZERO(&sums); @@ -143,8 +146,6 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * restrict hm = x[i].qh; const int8_t * restrict q8 = y[i].qs; - GGML_I32x16_VEC_ZERO(&aux32); - int8_t * restrict a = aux8; uint8_t m = 1; for (int j = 0; j < QK_K/64; ++j) { @@ -164,17 +165,20 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r utmp[0] &= kmask1; int sumi = 0; + + GGML_I32x16_VEC_ZERO(&aux32); + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; a = aux8; int is = 0; for (int j = 0; j < QK_K/32; ++j) { int32_t scale = scales[is++]; for (int l = 0; l < 16; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - GGML_I16x8_S_FMA_I32x16 (&aux16, scale, &aux32); + GGML_I16x16_S_FMA_I32x16 (&aux16, scale, &aux32); q8 += 16; a += 16; /* FIXME: while comparing FMA output to normal output, the original had an error. hunt it down. */ for (int l = 0; l < 16; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - GGML_I16x8_S_FMA_I32x16 (&aux16, scale, &aux32); + GGML_I16x16_S_FMA_I32x16 (&aux16, scale, &aux32); q8 += 16; a += 16; } const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; From c28bfe4552de457578e6f83d0111c44a42079230 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sun, 24 Mar 2024 12:37:47 +0000 Subject: [PATCH 44/52] spacing changes, eliminate dead references to k1 or zero, and use the right type when referring to src. 
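The zero/[Z] inputs and k1 masking removed here had become dead weight after
the earlier rewrites; their templates no longer touch them. Stale operands
and clobbers are not free: every register named in a clobber list is one the
compiler must keep out of play across the asm, so each list should name
exactly what its template touches, e.g.

    __asm__ ("vmovaps\t%%zmm8,\t%[RES]"
             : [RES] "+m" (*target) : : "zmm8", "memory");

(illustrative fragment only.)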
--- ggml-phi-knc-dot_q5_K_q8_K.c | 78 ++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 66f1f1622..a067a8724 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -22,9 +22,10 @@ #define GGML_F32_EPR 16 typedef float float32x8_t __attribute__((vector_size (64))); +typedef float float32x16_t __attribute__((vector_size (128))); typedef int16_t int16x8_t __attribute__((vector_size (32))); -typedef int32_t int32x8_t __attribute__((vector_size (64))); typedef int16_t int16x16_t __attribute__((vector_size (64))); +typedef int32_t int32x8_t __attribute__((vector_size (64))); typedef int32_t int32x16_t __attribute__((vector_size (128))); /* A forward declaration, to keep GCC happy. */ @@ -37,13 +38,13 @@ inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) uint32_t mask=0x000000FF; __asm__ __volatile__ ( - "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. - "kmov\t%[M],\t%%k1\n\t" + "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. + "kmov\t%[M],\t%%k1\n\t" "vmovaps\t\t%%zmm8,\t%[RES]%{%%k1%}\n\t" - : [RES] "+m" (*target) - : [Z] "m" (zero), - [M] "r" (mask) - : "zmm8", "k1", "memory"); + : [RES] "+m" (*target) + : [Z] "m" (zero), + [M] "r" (mask) + : "zmm8", "k1", "memory"); } /* clear a vector of 8 int32_ts. */ @@ -53,13 +54,13 @@ inline static void GGML_I32x8_VEC_ZERO(int32x8_t *target) uint32_t mask=0x000000FF; __asm__ __volatile__ ( - "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. - "kmov\t%[M],\t%%k1\n\t" + "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. + "kmov\t%[M],\t%%k1\n\t" "vmovaps\t\t%%zmm8,\t%[RES]%{%%k1%}\n\t" - : [RES] "+m" (*target) - : [Z] "m" (zero), - [M] "r" (mask) - : "zmm8", "k1", "memory"); + : [RES] "+m" (*target) + : [Z] "m" (zero), + [M] "r" (mask) + : "zmm8", "k1", "memory"); } /* clear a vector of 16 int32_ts. */ @@ -68,12 +69,11 @@ inline static void GGML_I32x16_VEC_ZERO(int32x16_t *target) uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; __asm__ __volatile__ ( - "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. - "kmov\t%[M],\t%%k1\n\t" - "vmovaps\t\t%%zmm8,\t%[RES]%{%%k1%}\n\t" - : [RES] "+m" (*target) - : [Z] "m" (zero) - : "zmm8", "k1", "memory"); + "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. + "vmovaps\t\t%%zmm8,\t%[RES]\n\t" + : [RES] "+m" (*target) + : [Z] "m" (zero) + : "zmm8", "memory"); } // perform a Fused Multiply Add of an I16x8 times scalar S into I32x8. @@ -84,18 +84,18 @@ inline static void GGML_I16x8_S_FMA_I32x8 (int16x8_t *src, int32_t scale, int32x int32_t scaleVec[4] = {scale, scale, scale, scale}; __asm__ __volatile__ ( - "kmov\t%[M],\t%%k1\n\t" // we will only be working with 8 values at a time. le sigh. - "vmovdqa32\t\t%[SRC]%{sint16%},\t%%zmm0%{%%k1%}\n\t" // load the item we will be summing from. upscale it from int16. - "vbroadcastI32x4\t%[SCALE],\t%%zmm1\n\t" // load the item we will be multiplying by. + "kmov\t%[M],\t%%k1\n\t" // we will only be working with 8 values at a time. le sigh. + "vmovdqa32\t\t%[SRC]%{sint16%},\t%%zmm0%{%%k1%}\n\t" // load the item we will be summing from. upscale it from int16. + "vbroadcastI32x4\t%[SCALE],\t%%zmm1\n\t" // load the item we will be multiplying by. 
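+ // (vpmadd231d below computes, per 32-bit lane: zmm2 += zmm0 * zmm1.)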
"vmovdqa32\t\t%[RES],\t%%zmm2%{%%k1%}\n\t" // load the item we will be summing onto. - "vpmadd231d\t%%zmm0,\t%%zmm1,\t%%zmm2%{%%k1%}\n\t" // perform our multiply-add. - "vmovdqa32\t\t%%zmm2,\t%[RES]%{%%k1}\n\t" // save the result. - : [RES] "+m" (*dest) - : [Z] "m" (zero), - [M] "r" (mask), - [SRC] "m" (src), - [SCALE] "m" (scaleVec) - : "zmm0", "zmm1", "zmm2", "k1", "memory"); + "vpmadd231d\t%%zmm0,\t%%zmm1,\t%%zmm2%{%%k1%}\n\t" // perform our multiply-add. + "vmovdqa32\t\t%%zmm2,\t%[RES]%{%%k1}\n\t" // save the result. + : [RES] "+m" (*dest) + : [Z] "m" (zero), + [M] "r" (mask), + [SRC] "m" (src), + [SCALE] "m" (scaleVec) + : "zmm0", "zmm1", "zmm2", "k1", "memory"); } // perform a Fused Multiply Add of an I16x16 times scalar S into I32x16. @@ -104,15 +104,15 @@ inline static void GGML_I16x16_S_FMA_I32x16 (int16x16_t *src, int32_t scale, int int32_t scaleVec[4] = {scale, scale, scale, scale}; __asm__ __volatile__ ( - "vmovdqa32\t\t%[SRC]%{sint16%},\t%%zmm0\n\t" // load the item we will be summing from. upscale it from int16. - "vbroadcastI32x4\t%[SCALE],\t%%zmm1\n\t" // load the item we will be multiplying by. + "vmovdqa32\t\t%[SRC]%{sint16%},\t%%zmm0\n\t" // load the item we will be summing from. upscale it from int16. + "vbroadcastI32x4\t%[SCALE],\t%%zmm1\n\t" // load the item we will be multiplying by. "vmovdqa32\t\t%[RES],\t%%zmm2\n\t" // load the item we will be summing onto. - "vpmadd231d\t%%zmm0,\t%%zmm1,\t%%zmm2\n\t" // perform our multiply-add. - "vmovdqa32\t\t%%zmm2,\t%[RES]\n\t" // save the result. - : [RES] "+m" (*dest) - : [SRC] "m" (src), - [SCALE] "m" (scaleVec) - : "zmm0", "zmm1", "zmm2", "k1", "memory"); + "vpmadd231d\t%%zmm0,\t%%zmm1,\t%%zmm2\n\t" // perform our multiply-add. + "vmovdqa32\t\t%%zmm2,\t%[RES]\n\t" // save the result. + : [RES] "+m" (*dest) + : [SRC] "m" (*src), + [SCALE] "m" (scaleVec) + : "zmm0", "zmm1", "zmm2", "memory"); } void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { @@ -176,8 +176,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int l = 0; l < 16; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; GGML_I16x16_S_FMA_I32x16 (&aux16, scale, &aux32); q8 += 16; a += 16; - /* FIXME: while comparing FMA output to normal output, the original had an error. hunt it down. */ for (int l = 0; l < 16; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; + // FIXME: while comparing FMA output to the original output, the original had an error. hunt it down. GGML_I16x16_S_FMA_I32x16 (&aux16, scale, &aux32); q8 += 16; a += 16; } From 169a1454092ee868d51985d788e6f6f14b8273f1 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sun, 24 Mar 2024 12:41:21 +0000 Subject: [PATCH 45/52] fix our reference to src in the second place, and use a more accurate comment. --- ggml-phi-knc-dot_q5_K_q8_K.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index a067a8724..8688836b9 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -38,7 +38,7 @@ inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) uint32_t mask=0x000000FF; __asm__ __volatile__ ( - "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. + "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our register. 
"kmov\t%[M],\t%%k1\n\t" "vmovaps\t\t%%zmm8,\t%[RES]%{%%k1%}\n\t" : [RES] "+m" (*target) @@ -54,7 +54,7 @@ inline static void GGML_I32x8_VEC_ZERO(int32x8_t *target) uint32_t mask=0x000000FF; __asm__ __volatile__ ( - "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. + "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our register. "kmov\t%[M],\t%%k1\n\t" "vmovaps\t\t%%zmm8,\t%[RES]%{%%k1%}\n\t" : [RES] "+m" (*target) @@ -69,7 +69,7 @@ inline static void GGML_I32x16_VEC_ZERO(int32x16_t *target) uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; __asm__ __volatile__ ( - "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. + "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our register. "vmovaps\t\t%%zmm8,\t%[RES]\n\t" : [RES] "+m" (*target) : [Z] "m" (zero) @@ -93,7 +93,7 @@ inline static void GGML_I16x8_S_FMA_I32x8 (int16x8_t *src, int32_t scale, int32x : [RES] "+m" (*dest) : [Z] "m" (zero), [M] "r" (mask), - [SRC] "m" (src), + [SRC] "m" (*src), [SCALE] "m" (scaleVec) : "zmm0", "zmm1", "zmm2", "k1", "memory"); } From cf481cf9017e32508f901c36af761b313cd70938 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sun, 24 Mar 2024 12:50:01 +0000 Subject: [PATCH 46/52] promote aux8 into a vector. --- ggml-phi-knc-dot_q5_K_q8_K.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 8688836b9..66c0f3b58 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -23,6 +23,7 @@ typedef float float32x8_t __attribute__((vector_size (64))); typedef float float32x16_t __attribute__((vector_size (128))); +typedef int8_t int8x16_t __attribute__((vector_size (32))); typedef int16_t int16x8_t __attribute__((vector_size (32))); typedef int16_t int16x16_t __attribute__((vector_size (64))); typedef int32_t int32x8_t __attribute__((vector_size (64))); @@ -79,7 +80,6 @@ inline static void GGML_I32x16_VEC_ZERO(int32x16_t *target) // perform a Fused Multiply Add of an I16x8 times scalar S into I32x8. inline static void GGML_I16x8_S_FMA_I32x8 (int16x8_t *src, int32_t scale, int32x8_t *dest) { - uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; uint32_t mask=0x000000FF; int32_t scaleVec[4] = {scale, scale, scale, scale}; @@ -91,8 +91,7 @@ inline static void GGML_I16x8_S_FMA_I32x8 (int16x8_t *src, int32_t scale, int32x "vpmadd231d\t%%zmm0,\t%%zmm1,\t%%zmm2%{%%k1%}\n\t" // perform our multiply-add. "vmovdqa32\t\t%%zmm2,\t%[RES]%{%%k1}\n\t" // save the result. 
: [RES] "+m" (*dest) - : [Z] "m" (zero), - [M] "r" (mask), + : [M] "r" (mask), [SRC] "m" (*src), [SCALE] "m" (scaleVec) : "zmm0", "zmm1", "zmm2", "k1", "memory"); @@ -134,6 +133,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * mins = (const uint8_t*)&utmp[2]; int8_t aux8[QK_K]; + int8x16_t aux8x16[QK_K/16] __attribute__((aligned(32))); float32x16_t sums __attribute__((aligned(128))); int16x16_t aux16 __attribute__((aligned(64))); int32x16_t aux32 __attribute__((aligned(128))); @@ -146,7 +146,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * restrict hm = x[i].qh; const int8_t * restrict q8 = y[i].qs; - int8_t * restrict a = aux8; + int8_t * restrict a = aux8_16; uint8_t m = 1; for (int j = 0; j < QK_K/64; ++j) { for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); @@ -169,7 +169,6 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r GGML_I32x16_VEC_ZERO(&aux32); for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; - a = aux8; int is = 0; for (int j = 0; j < QK_K/32; ++j) { int32_t scale = scales[is++]; From ca0dc26704cf9c7d40e6de691ff462efaeecebca Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sun, 24 Mar 2024 13:35:05 +0000 Subject: [PATCH 47/52] loosen alignment requirements for zeros, add missing function, and promote aux8 to an array of vectors. --- ggml-phi-knc-dot_q5_K_q8_K.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 66c0f3b58..26e03d241 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -35,7 +35,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r /* clear a vector of 8 floats. */ inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) { - uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; + uint8_t zero[4] __attribute__((aligned(32))) = {0,0,0,0}; uint32_t mask=0x000000FF; __asm__ __volatile__ ( @@ -48,10 +48,23 @@ inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) : "zmm8", "k1", "memory"); } +/* clear a vector of 16 floats. */ +inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target) +{ + uint8_t zero[4] __attribute__((aligned(32))) = {0,0,0,0}; + + __asm__ __volatile__ ( + "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our register. + "vmovaps\t\t%%zmm8,\t%[RES]\n\t" + : [RES] "+m" (*target) + : [Z] "m" (zero) + : "zmm8", "memory"); +} + /* clear a vector of 8 int32_ts. */ inline static void GGML_I32x8_VEC_ZERO(int32x8_t *target) { - uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; + uint8_t zero[4] __attribute__((aligned(32))) = {0,0,0,0}; uint32_t mask=0x000000FF; __asm__ __volatile__ ( @@ -67,7 +80,7 @@ inline static void GGML_I32x8_VEC_ZERO(int32x8_t *target) /* clear a vector of 16 int32_ts. */ inline static void GGML_I32x16_VEC_ZERO(int32x16_t *target) { - uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; + uint8_t zero[4] __attribute__((aligned(32))) = {0,0,0,0}; __asm__ __volatile__ ( "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our register. 
@@ -132,9 +145,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * scales = (const uint8_t*)&utmp[0]; const uint8_t * mins = (const uint8_t*)&utmp[2]; - int8_t aux8[QK_K]; - int8x16_t aux8x16[QK_K/16] __attribute__((aligned(32))); float32x16_t sums __attribute__((aligned(128))); + int8x16_t aux8[QK_K/16] __attribute__((aligned(32))); int16x16_t aux16 __attribute__((aligned(64))); int32x16_t aux32 __attribute__((aligned(128))); @@ -146,8 +158,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * restrict hm = x[i].qh; const int8_t * restrict q8 = y[i].qs; - int8_t * restrict a = aux8_16; + int8_t * restrict a = (int8_t * restrict)aux8; uint8_t m = 1; + + // Fill the 8 bit vector a with our 5 bit quantization data, 64 blocks at a time. for (int j = 0; j < QK_K/64; ++j) { for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); @@ -157,12 +171,15 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r a += 32; m <<= 1; q4 += 32; } + memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); const uint32_t uaux = utmp[1] & kmask1; utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); utmp[2] = uaux; utmp[0] &= kmask1; + + a = (int8_t * restrict)aux8; int sumi = 0; From bc3d6db8624170bc60efab5dca66f16fec06b9d9 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sun, 24 Mar 2024 14:18:08 +0000 Subject: [PATCH 48/52] separate filling aux16 from consuming aux16 by making it an array of vectors. --- ggml-phi-knc-dot_q5_K_q8_K.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 26e03d241..eebd12d89 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -147,7 +147,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r float32x16_t sums __attribute__((aligned(128))); int8x16_t aux8[QK_K/16] __attribute__((aligned(32))); - int16x16_t aux16 __attribute__((aligned(64))); + int16x16_t aux16[QK_K/16] __attribute__((aligned(64))); int32x16_t aux32 __attribute__((aligned(128))); GGML_F32x16_VEC_ZERO(&sums); @@ -188,15 +188,19 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; int is = 0; for (int j = 0; j < QK_K/32; ++j) { - int32_t scale = scales[is++]; - for (int l = 0; l < 16; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - GGML_I16x16_S_FMA_I32x16 (&aux16, scale, &aux32); + for (int l = 0; l < 16; ++l) ((int16_t *)&aux16[j*2])[l] = q8[l] * a[l]; q8 += 16; a += 16; - for (int l = 0; l < 16; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - // FIXME: while comparing FMA output to the original output, the original had an error. hunt it down. - GGML_I16x16_S_FMA_I32x16 (&aux16, scale, &aux32); + for (int l = 0; l < 16; ++l) ((int16_t *)&aux16[(j*2)+1])[l] = q8[l] * a[l]; q8 += 16; a += 16; } + + // FIXME: while comparing FMA output to the original output, the original had an error. hunt it down. 
+ for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + GGML_I16x16_S_FMA_I32x16 (&aux16[j*2], scale, &aux32); + GGML_I16x16_S_FMA_I32x16 (&aux16[(j*2)+1], scale, &aux32); + } + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 16; ++l) ((float *)&sums)[l] += d * ((int32_t *)&aux32)[l]; const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; From 12c9576aeca0a11109f5349baf7bdba377ec4353 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Mon, 25 Mar 2024 19:43:37 +0000 Subject: [PATCH 49/52] fix vector sizes. --- ggml-phi-knc-dot_q5_K_q8_K.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index eebd12d89..418fa772d 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -15,19 +15,18 @@ // For block_q5_K and block_q8_K. only given the second time. #include "ggml-common.h" - // This SIMD unit can work with 32 float32s at once. #define GGML_F32_STEP 32 // We can fit 16 of these float32s in a single vector register. #define GGML_F32_EPR 16 -typedef float float32x8_t __attribute__((vector_size (64))); -typedef float float32x16_t __attribute__((vector_size (128))); -typedef int8_t int8x16_t __attribute__((vector_size (32))); -typedef int16_t int16x8_t __attribute__((vector_size (32))); -typedef int16_t int16x16_t __attribute__((vector_size (64))); -typedef int32_t int32x8_t __attribute__((vector_size (64))); -typedef int32_t int32x16_t __attribute__((vector_size (128))); +typedef float float32x8_t __attribute__((vector_size (32))); +typedef float float32x16_t __attribute__((vector_size (64))); +typedef int8_t int8x16_t __attribute__((vector_size (16))); +typedef int16_t int16x8_t __attribute__((vector_size (16))); +typedef int16_t int16x16_t __attribute__((vector_size (32))); +typedef int32_t int32x8_t __attribute__((vector_size (32))); +typedef int32_t int32x16_t __attribute__((vector_size (64))); /* A forward declaration, to keep GCC happy. */ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); @@ -145,10 +144,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * scales = (const uint8_t*)&utmp[0]; const uint8_t * mins = (const uint8_t*)&utmp[2]; - float32x16_t sums __attribute__((aligned(128))); - int8x16_t aux8[QK_K/16] __attribute__((aligned(32))); - int16x16_t aux16[QK_K/16] __attribute__((aligned(64))); - int32x16_t aux32 __attribute__((aligned(128))); + float32x16_t sums __attribute__((aligned(64))); + int8x16_t aux8[QK_K/16] __attribute__((aligned(16))); + int16x16_t aux16[QK_K/16] __attribute__((aligned(32))); + int32x16_t aux32 __attribute__((aligned(64))); GGML_F32x16_VEC_ZERO(&sums); From 9f569ca50b03dca2b494bf0a641fa68703557d15 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Tue, 2 Apr 2024 15:41:56 +0000 Subject: [PATCH 50/52] massively rewrite assembly routines. --- ggml-phi-knc-dot_q5_K_q8_K.c | 237 ++++++++++++++++++++--------------- 1 file changed, 135 insertions(+), 102 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 418fa772d..1145dfff7 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -20,112 +20,154 @@ // We can fit 16 of these float32s in a single vector register. #define GGML_F32_EPR 16 +/* we force an alignment, because i haven't written unaligned forms of the assembly functions, yet.. 
*/ typedef float float32x8_t __attribute__((vector_size (32))); -typedef float float32x16_t __attribute__((vector_size (64))); -typedef int8_t int8x16_t __attribute__((vector_size (16))); +typedef float float32x16_t __attribute__((vector_size (64), aligned(64))); +typedef int8_t int8x16_t __attribute__((vector_size (16), aligned(16))); +typedef uint8_t uint8x16_t __attribute__((vector_size (16), aligned(16))); typedef int16_t int16x8_t __attribute__((vector_size (16))); typedef int16_t int16x16_t __attribute__((vector_size (32))); typedef int32_t int32x8_t __attribute__((vector_size (32))); -typedef int32_t int32x16_t __attribute__((vector_size (64))); +typedef int32_t int32x16_t __attribute__((vector_size (64), aligned(64))); /* A forward declaration, to keep GCC happy. */ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); -/* clear a vector of 8 floats. */ -inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) -{ - uint8_t zero[4] __attribute__((aligned(32))) = {0,0,0,0}; - uint32_t mask=0x000000FF; - - __asm__ __volatile__ ( - "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our register. - "kmov\t%[M],\t%%k1\n\t" - "vmovaps\t\t%%zmm8,\t%[RES]%{%%k1%}\n\t" - : [RES] "+m" (*target) - : [Z] "m" (zero), - [M] "r" (mask) - : "zmm8", "k1", "memory"); -} - /* clear a vector of 16 floats. */ inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target) { - uint8_t zero[4] __attribute__((aligned(32))) = {0,0,0,0}; + uint8_t zero=0; __asm__ __volatile__ ( - "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our register. + "vbroadcastss\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our register. "vmovaps\t\t%%zmm8,\t%[RES]\n\t" : [RES] "+m" (*target) : [Z] "m" (zero) : "zmm8", "memory"); } -/* clear a vector of 8 int32_ts. */ -inline static void GGML_I32x8_VEC_ZERO(int32x8_t *target) +// This function perform two multiplies of an I8x16 and an I8x16 vector into two I16x16 vectors. then does an FMA on the scaled result of multiplying the two I16x16 vectors, adding the result into an I32x16. +// it loops 8 times. well, actually four, with an unroll. +inline static void GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16 (int8x16_t *src11, uint8x16_t *src21, const uint8_t *scale, int32x16_t *res) { - uint8_t zero[4] __attribute__((aligned(32))) = {0,0,0,0}; - uint32_t mask=0x000000FF; + uint8_t zero = 0; __asm__ __volatile__ ( - "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our register. - "kmov\t%[M],\t%%k1\n\t" - "vmovaps\t\t%%zmm8,\t%[RES]%{%%k1%}\n\t" - : [RES] "+m" (*target) - : [Z] "m" (zero), - [M] "r" (mask) - : "zmm8", "k1", "memory"); + "vprefetche0\t(%[SRC11])\n\t" + "vprefetche0\t(%[SRC21])\n\t" + "vprefetche0\t(%[SCALE])\n\t" + "mov\t$0,\t%%ecx\n\t" + "mov\t%[SRC11],\t%%r12\n\t" + "mov\t%[SRC21],\t%%r8\n\t" + "mov\t%[SCALE],\t%%r9\n\t" + "vpbroadcastd\t%[Z]%{uint8%},\t%%zmm7\n\t" // empty our result. + + "1:\n\t" + "inc\t%%ecx\n\t" // we are in our loop, increment our counter. + "cmp\t$4,\t%%ecx\n\t" // see if this is our last run-through. + "vmovdqa32\t\t(%%r12)%{sint8%},\t%%zmm0\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vmovdqa32\t\t(%%r8)%{uint8%},\t%%zmm1\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. 
+ "vpmulld\t%%zmm0,\t%%zmm1,\t%%zmm2\n\t" // perform our 64 bit multiply, low side. + "vpbroadcastd\t(%%r9)%{uint8%},\t%%zmm6\n\t" // load the item we will be multiplying by. + "vpmadd231d\t%%zmm2,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. + "vmovdqa32\t\t16(%%r12)%{sint8%},\t%%zmm3\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vmovdqa32\t\t16(%%r8)%{uint8%},\t%%zmm4\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. + "vpmulld\t%%zmm3,\t%%zmm4,\t%%zmm5\n\t" // perform our 64 bit multiply, low side. + "vpmadd231d\t%%zmm5,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. + "vmovdqa32\t\t32(%%r12)%{sint8%},\t%%zmm8\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vmovdqa32\t\t32(%%r8)%{uint8%},\t%%zmm1\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. + "vpmulld\t%%zmm8,\t%%zmm1,\t%%zmm2\n\t" // perform our 64 bit multiply, low side. + "vpbroadcastd\t1(%%r9)%{uint8%},\t%%zmm6\n\t" // load the item we will be multiplying by. + "vpmadd231d\t%%zmm2,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. + "vmovdqa32\t\t48(%%r12)%{sint8%},\t%%zmm3\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vmovdqa32\t\t48(%%r8)%{uint8%},\t%%zmm4\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. + "vpmulld\t%%zmm3,\t%%zmm4,\t%%zmm5\n\t" // perform our 64 bit multiply, low side. + "vpmadd231d\t%%zmm5,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. + "je\t2f\n\t" // if this is the last time through our loop, jump to 2. + "vprefetche0\t64(%%r12)\n\t" // otherwise, prepare for another run-through. + "vprefetche0\t64(%%r8)\n\t" + "vprefetche2\t128(%%r12)\n\t" + "vprefetche2\t128(%%r8)\n\t" + "add\t$64,\t%%r12\n\t" + "add\t$64,\t%%r8\n\t" + "add\t$2,\t%%r9\n\t" + "jmp\t1b\n\t" + "2:\n\t" + "vmovdqa32\t\t%%zmm7,\t(%[RES])\n\t" // save the result. + : [RES] "+r" (res) + : [SRC11] "r" (src11), + [SRC21] "r" (src21), + [SCALE] "r" (scale), + [Z] "m" (zero) + : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "ecx", "r8", "r9", "r12", "memory"); } -/* clear a vector of 16 int32_ts. */ -inline static void GGML_I32x16_VEC_ZERO(int32x16_t *target) +// Unpack 256 unsigned 5 bit values into an 8 bit vector. +inline static void GGML_5bit_Unpack (const uint8x16_t * q4, const uint8_t * q1, uint8x16_t * dst) { - uint8_t zero[4] __attribute__((aligned(32))) = {0,0,0,0}; + uint8_t lowmask = 0x0F; + uint32_t allmask=0xFFFFFFFF; + uint8_t m=1; + uint8_t bit5 = 0x10; __asm__ __volatile__ ( - "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our register. - "vmovaps\t\t%%zmm8,\t%[RES]\n\t" - : [RES] "+m" (*target) - : [Z] "m" (zero) - : "zmm8", "memory"); + "vprefetche0\t(%[SRC1])\n\t" + "vprefetche0\t(%[SRC4])\n\t" + "vprefetche1\t64(%[SRC4])\n\t" + "mov\t%[SRC4],\t%%r12\n\t" // load the address of the head of our 4-bit list. + "mov\t%[DST],\t%%r8\n\t" // load the address of the head of our destination list. + "mov\t$0,%%ecx\n\t" // initialize our counter. + "vmovdqa32\t(%[SRC1])%{uint8%},\t%%zmm6\n\t" // move 16 packed sets of single bits into the lower 8 bits of zmm6. + "vmovdqa32\t16(%[SRC1])%{uint8%},\t%%zmm7\n\t" // move the next 16 packed sets of single bits into the lower 8 bits of zmm7. + "vpbroadcastd\t%[MASK]%{uint8%},\t%%zmm2\n\t " // load our mask. + "vpbroadcastd\t%[BIT5]%{uint8},\t%%zmm9\n\t" // load the bit we want to add (conditionally). 
+ "vpbroadcastd\t%[M]%{uint8%},\t%%zmm8\n\t" // select which bit we want to test for. + + "1:\n\t" + "inc\t%%ecx\n\t" // we are in the loop. increment the counter. + + "vptestmd\t%%zmm6,\t%%zmm8,\t%%k1\n\t" // perform our test. + "vptestmd\t%%zmm7,\t%%zmm8,\t%%k2\n\t" // perform our test. + "vmovdqa32\t\t(%%r12)%{uint8%},\t%%zmm0\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vpandd\t%%zmm0,\t%%zmm2,\t%%zmm4\n\t" // apply a mask, storing the low four bits of vector zmm0 into zmm4. + "vpaddd\t%%zmm4,%%zmm9,%%zmm4%{%%k1%}\n\t" // turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm4%{uint8%},\t(%%r8)\n\t" // save our result. + "vmovdqa32\t\t16(%%r12)%{uint8%},\t%%zmm1\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vpandd\t%%zmm1,\t%%zmm2,\t%%zmm5\n\t" // apply a mask, storing the next low four bits of vector zmm1 into zmm5. + "vpaddd\t%%zmm5,%%zmm9,%%zmm5%{%%k2%}\n\t" // turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm5%{uint8%},\t16(%%r8)\n\t" // save our result. + + "add\t$32,\t%%r8\n\t" + "cmp\t$4,\t%%ecx\n\t" + "vpslld\t$1,\t%%zmm8,\t%%zmm8\n\t" // select which bit we want to test for. + + "vptestmd\t%%zmm6,\t%%zmm8,\t%%k1\n\t" // perform our test. + "vptestmd\t%%zmm7,\t%%zmm8,\t%%k2\n\t" // perform our test. + "vpsrld\t$4,\t%%zmm0,\t%%zmm4\n\t" // load our even 4 bit sequence into zmm4. + "vpaddd\t%%zmm4,%%zmm9,%%zmm4%{%%k1%}\n\t" // turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm4%{uint8%},\t(%%r8)\n\t" // save our result. + "vpsrld\t$4,\t%%zmm1,\t%%zmm5\n\t" // load our even 4 bit sequence into zmm5. + "vpaddd\t%%zmm5,%%zmm9,%%zmm5%{%%k2%}\n\t" // turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm5%{uint8%},\t16(%%r8)\n\t" // save our result. + + "je\t2f\n\t" + + "vpslld\t$1,\t%%zmm8,\t%%zmm8\n\t" // select which bit we want to test for. + "add\t$32,\t%%r12\n\t" + "add\t$32,\t%%r8\n\t" + "jmp\t1b\n\t" + "2:" + : [DST] "+r" (dst) + : [SRC4] "r" (q4), + [SRC1] "r" (q1), + [MASK] "m" (lowmask), + [M] "m" (m), + [ALL] "m" (allmask), + [BIT5] "m" (bit5) + : "zmm0", "zmm1", "zmm2", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "ecx", "k1", "k2", "r12", "r8", "memory" + ); } - -// perform a Fused Multiply Add of an I16x8 times scalar S into I32x8. -inline static void GGML_I16x8_S_FMA_I32x8 (int16x8_t *src, int32_t scale, int32x8_t *dest) -{ - uint32_t mask=0x000000FF; - int32_t scaleVec[4] = {scale, scale, scale, scale}; - - __asm__ __volatile__ ( - "kmov\t%[M],\t%%k1\n\t" // we will only be working with 8 values at a time. le sigh. - "vmovdqa32\t\t%[SRC]%{sint16%},\t%%zmm0%{%%k1%}\n\t" // load the item we will be summing from. upscale it from int16. - "vbroadcastI32x4\t%[SCALE],\t%%zmm1\n\t" // load the item we will be multiplying by. - "vmovdqa32\t\t%[RES],\t%%zmm2%{%%k1%}\n\t" // load the item we will be summing onto. - "vpmadd231d\t%%zmm0,\t%%zmm1,\t%%zmm2%{%%k1%}\n\t" // perform our multiply-add. - "vmovdqa32\t\t%%zmm2,\t%[RES]%{%%k1}\n\t" // save the result. - : [RES] "+m" (*dest) - : [M] "r" (mask), - [SRC] "m" (*src), - [SCALE] "m" (scaleVec) - : "zmm0", "zmm1", "zmm2", "k1", "memory"); -} - -// perform a Fused Multiply Add of an I16x16 times scalar S into I32x16. 
-
-// perform a Fused Multiply Add of an I16x8 times scalar S into I32x8.
-inline static void GGML_I16x8_S_FMA_I32x8 (int16x8_t *src, int32_t scale, int32x8_t *dest)
-{
-  uint32_t mask=0x000000FF;
-  int32_t scaleVec[4] = {scale, scale, scale, scale};
-
-  __asm__ __volatile__ (
-                        "kmov\t%[M],\t%%k1\n\t"                              // we will only be working with 8 values at a time. le sigh.
-                        "vmovdqa32\t\t%[SRC]%{sint16%},\t%%zmm0%{%%k1%}\n\t" // load the item we will be summing from. upscale it from int16.
-                        "vbroadcastI32x4\t%[SCALE],\t%%zmm1\n\t"             // load the item we will be multiplying by.
-                        "vmovdqa32\t\t%[RES],\t%%zmm2%{%%k1%}\n\t"           // load the item we will be summing onto.
-                        "vpmadd231d\t%%zmm0,\t%%zmm1,\t%%zmm2%{%%k1%}\n\t"   // perform our multiply-add.
-                        "vmovdqa32\t\t%%zmm2,\t%[RES]%{%%k1}\n\t"            // save the result.
-                        : [RES]   "+m" (*dest)
-                        : [M]     "r"  (mask),
-                          [SRC]   "m"  (*src),
-                          [SCALE] "m"  (scaleVec)
-                        : "zmm0", "zmm1", "zmm2", "k1", "memory");
-}
-
-// perform a Fused Multiply Add of an I16x16 times scalar S into I32x16.
-inline static void GGML_I16x16_S_FMA_I32x16 (int16x16_t *src, int32_t scale, int32x16_t *dest)
-{
-  int32_t scaleVec[4] = {scale, scale, scale, scale};
-
-  __asm__ __volatile__ (
-                        "vmovdqa32\t\t%[SRC]%{sint16%},\t%%zmm0\n\t"  // load the item we will be summing from. upscale it from int16.
-                        "vbroadcastI32x4\t%[SCALE],\t%%zmm1\n\t"      // load the item we will be multiplying by.
-                        "vmovdqa32\t\t%[RES],\t%%zmm2\n\t"            // load the item we will be summing onto.
-                        "vpmadd231d\t%%zmm0,\t%%zmm1,\t%%zmm2\n\t"    // perform our multiply-add.
-                        "vmovdqa32\t\t%%zmm2,\t%[RES]\n\t"            // save the result.
-                        : [RES]   "+m" (*dest)
-                        : [SRC]   "m"  (*src),
-                          [SCALE] "m"  (scaleVec)
-                        : "zmm0", "zmm1", "zmm2", "memory");
-}
-
+
 void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
   /* interpret X and Y as vectors. */
@@ -144,32 +186,26 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
   const uint8_t * scales = (const uint8_t*)&utmp[0];
   const uint8_t * mins = (const uint8_t*)&utmp[2];
 
-  float32x16_t sums __attribute__((aligned(64)));
-  int8x16_t aux8[QK_K/16] __attribute__((aligned(16)));
-  int16x16_t aux16[QK_K/16] __attribute__((aligned(32)));
-  int32x16_t aux32 __attribute__((aligned(64)));
+  float32x16_t sums;
 
+  // clear sums.
   GGML_F32x16_VEC_ZERO(&sums);
 
   float sumf = 0;
 
   for (int i = 0; i < nb; ++i) {
-    const uint8_t * restrict q4 = x[i].qs;
-    const uint8_t * restrict hm = x[i].qh;
-    const int8_t  * restrict q8 = y[i].qs;
+    int8x16_t q8copy [QK_K];
+    int32x16_t aux32;
+    uint8x16_t q4copyvec [QK_K/32];
+    uint8x16_t aux8 [QK_K/16];
 
-    int8_t * restrict a = (int8_t * restrict)aux8;
-    uint8_t m = 1;
+    // Fill in our 8 bit vector from y[]. Required, because there is no good way to align members of y[], and I haven't mastered unaligned assembly yet...
+    memcpy (q8copy, y[i].qs, QK_K);
 
-    // Fill the 8 bit vector a with our 5 bit quantization data, 64 blocks at a time.
-    for (int j = 0; j < QK_K/64; ++j) {
-      for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
-      for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-      a += 32; m <<= 1;
-      for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
-      for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
-      a += 32; m <<= 1;
-      q4 += 32;
-    }
+    // Fill in our 4 bit vector from x[]. Required, because there is no good way to align members of x[], and I haven't mastered unaligned assembly yet...
+    memcpy (q4copyvec, x[i].qs, QK_K/2);
+
+    // combine our 4 and 1 bit vector sets into an 8 bit value.
+    GGML_5bit_Unpack(q4copyvec, x[i].qh, aux8);
 
     memcpy(utmp, x[i].scales, 12);
     utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@@ -194,17 +230,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
     }
 
     // FIXME: while comparing FMA output to the original output, the original had an error. hunt it down.
-    for (int j = 0; j < QK_K/32; ++j) {
-      int32_t scale = scales[is++];
-      GGML_I16x16_S_FMA_I32x16 (&aux16[j*2], scale, &aux32);
-      GGML_I16x16_S_FMA_I32x16 (&aux16[(j*2)+1], scale, &aux32);
-    }
+    GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16(q8copy, aux8, scales, &aux32);
 
     const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
     for (int l = 0; l < 16; ++l) ((float *)&sums)[l] += d * ((int32_t *)&aux32)[l];
     const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
     sumf -= dmin * sumi;
   }
+  for (int l = 0; l < 16; ++l) sumf += ((float *)&sums)[l];
   *s = sumf;
 }
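A note on the math in the loop above: q5_K reconstructs each weight as approximately d * scale[j] * q - dmin * min[j], so after the FMA kernel produces the scaled integer dot product, the per-block minimums still have to be subtracted. Since y[i].bsums[] holds per-16-element sums of y and each min covers 32 elements, the correction term (which the next patch moves right next to the FMA call) reduces to:

    int sumi = 0;
    for (int j = 0; j < QK_K/16; ++j)
        sumi += y[i].bsums[j] * mins[j/2];   /* sum of y, weighted by each 32-element block's minimum. */
    /* ...and later: sumf -= dmin * sumi; */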
From 8c17353717a297d39edb8cd6b6e7b7a5350f94d3 Mon Sep 17 00:00:00 2001
From: Julia Longtin
Date: Tue, 2 Apr 2024 16:55:40 +0000
Subject: [PATCH 51/52] minor changes.

---
 ggml-phi-knc-dot_q5_K_q8_K.c | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c
index 1145dfff7..02545057f 100644
--- a/ggml-phi-knc-dot_q5_K_q8_K.c
+++ b/ggml-phi-knc-dot_q5_K_q8_K.c
@@ -21,13 +21,9 @@
 #define GGML_F32_EPR 16
 
 /* we force an alignment, because i haven't written unaligned forms of the assembly functions, yet.. */
-typedef float float32x8_t __attribute__((vector_size (32)));
 typedef float float32x16_t __attribute__((vector_size (64), aligned(64)));
 typedef int8_t int8x16_t __attribute__((vector_size (16), aligned(16)));
 typedef uint8_t uint8x16_t __attribute__((vector_size (16), aligned(16)));
-typedef int16_t int16x8_t __attribute__((vector_size (16)));
-typedef int16_t int16x16_t __attribute__((vector_size (32)));
-typedef int32_t int32x8_t __attribute__((vector_size (32)));
 typedef int32_t int32x16_t __attribute__((vector_size (64), aligned(64)));
 
 /* A forward declaration, to keep GCC happy. */
@@ -168,6 +164,8 @@ inline static void GGML_5bit_Unpack (const uint8x16_t * q4, const uint8_t * q1,
 }
 
+// A function for getting the dot product of two vectors, one of 5 bit resolution, and one of 8.
+// Used during inference, if your model prints "llama_model_loader: - type q5_K: XXX tensors", and XXX is not zero. :)
 void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
   /* interpret X and Y as vectors. */
@@ -207,6 +205,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
     // combine our 4 and 1 bit vector sets into an 8 bit value.
     GGML_5bit_Unpack(q4copyvec, x[i].qh, aux8);
 
+    // extract scales and mins..
     memcpy(utmp, x[i].scales, 12);
     utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
     const uint32_t uaux = utmp[1] & kmask1;
@@ -220,24 +219,17 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     GGML_I32x16_VEC_ZERO(&aux32);
 
-    for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-    int is = 0;
-    for (int j = 0; j < QK_K/32; ++j) {
-      for (int l = 0; l < 16; ++l) ((int16_t *)&aux16[j*2])[l] = q8[l] * a[l];
-      q8 += 16; a += 16;
-      for (int l = 0; l < 16; ++l) ((int16_t *)&aux16[(j*2)+1])[l] = q8[l] * a[l];
-      q8 += 16; a += 16;
-    }
-    // FIXME: while comparing FMA output to the original output, the original had an error. hunt it down.
     GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16(q8copy, aux8, scales, &aux32);
 
+    int sumi = 0;
+    for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
     const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
     for (int l = 0; l < 16; ++l) ((float *)&sums)[l] += d * ((int32_t *)&aux32)[l];
     const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
     sumf -= dmin * sumi;
   }
-
+  for (int l = 0; l < 16; ++l) sumf += ((float *)&sums)[l];
   *s = sumf;
 }
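With these changes in place, ggml-phi-knc-dot_q5_K_q8_K.c exposes a single entry point. A minimal sketch of a call site, assuming x and y point to n/QK_K block_q5_K and block_q8_K superblocks respectively (the bs/bx/by strides appear unused on this code path, and nrc is expected to be 1, as with the other ggml dot product routines):

    /* Hypothetical wrapper -- the kernel itself comes from this patch series. */
    static float dot_q5_q8(int n, const void *x, const void *y)
    {
        float result = 0.0f;
        ggml_vec_dot_q5_K_q8_K(n, &result, 0, x, 0, y, 0, 1);
        return result;
    }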
From 47190a7fe2fe405c4bb1047f950245237d91b46b Mon Sep 17 00:00:00 2001
From: Julia Longtin
Date: Tue, 2 Apr 2024 17:01:53 +0000
Subject: [PATCH 52/52] formatting.

---
 ggml-phi-knc-dot_q5_K_q8_K.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c
index 02545057f..b8262b071 100644
--- a/ggml-phi-knc-dot_q5_K_q8_K.c
+++ b/ggml-phi-knc-dot_q5_K_q8_K.c
@@ -166,7 +166,7 @@ inline static void GGML_5bit_Unpack (const uint8x16_t * q4, const uint8_t * q1,
 
 // A function for getting the dot product of two vectors, one of 5 bit resolution, and one of 8.
 // Used during inference, if your model prints "llama_model_loader: - type q5_K: XXX tensors", and XXX is not zero. :)
-void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s,  size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
   /* interpret X and Y as vectors. */
   const block_q5_K * restrict x = vx;