From 868a2016ac2ee4b9b60adcf2dc154008a7e95551 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Tue, 12 Mar 2024 20:57:43 +0000 Subject: [PATCH 001/105] add detection of Xeon PHI: Knights Corner. --- ggml.c | 8 ++++++++ ggml.h | 1 + llama.cpp | 1 + 3 files changed, 10 insertions(+) diff --git a/ggml.c b/ggml.c index 9a7bd1d8c..d7cfe3a26 100644 --- a/ggml.c +++ b/ggml.c @@ -21590,4 +21590,12 @@ int ggml_cpu_has_matmul_int8(void) { #endif } +int ggml_cpu_is_xeonphi_knc(void) { +#if defined(__k1om__) + return 1; +#else + return 0; +#endif +} + //////////////////////////////////////////////////////////////////////////////// diff --git a/ggml.h b/ggml.h index 1171088a9..0024bbc7a 100644 --- a/ggml.h +++ b/ggml.h @@ -2358,6 +2358,7 @@ extern "C" { GGML_API int ggml_cpu_has_sycl (void); GGML_API int ggml_cpu_has_vsx (void); GGML_API int ggml_cpu_has_matmul_int8(void); + GGML_API int ggml_cpu_is_xeonphi_knc (void); // // Internal types and functions exposed for tests and benchmarks diff --git a/llama.cpp b/llama.cpp index ad7b7b7d4..2b0ee2922 100644 --- a/llama.cpp +++ b/llama.cpp @@ -14229,6 +14229,7 @@ const char * llama_print_system_info(void) { s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | "; s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | "; + s += "XEONPHI_KNC = " + std::to_string(ggml_cpu_is_xeonphi_knc()) + " | "; return s.c_str(); } From 7f3722beb6c43b12731a7a753b3984cde01f9d73 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Tue, 12 Mar 2024 21:02:14 +0000 Subject: [PATCH 002/105] handle the case that we have no glibc on the PHI. --- ggml.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index d7cfe3a26..eb094b85f 100644 --- a/ggml.c +++ b/ggml.c @@ -2154,6 +2154,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) { // figure out which node we're on uint current_cpu; int getcpu_ret = 0; +#if defined(__GLIBC__) #if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) getcpu_ret = getcpu(¤t_cpu, &g_state.numa.current_node); #else @@ -2163,7 +2164,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) { # endif getcpu_ret = syscall(SYS_getcpu, ¤t_cpu, &g_state.numa.current_node); #endif - +#endif if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) { g_state.numa.n_nodes = 0; return; From 5a2973af25c9960538a00304cad22aa6c0f626e7 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Tue, 12 Mar 2024 21:07:10 +0000 Subject: [PATCH 003/105] instead of checking on glibc, check on SYS_getcpu --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index eb094b85f..80b987b37 100644 --- a/ggml.c +++ b/ggml.c @@ -2154,7 +2154,6 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) { // figure out which node we're on uint current_cpu; int getcpu_ret = 0; -#if defined(__GLIBC__) #if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) getcpu_ret = getcpu(¤t_cpu, &g_state.numa.current_node); #else @@ -2162,6 +2161,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) { # if !defined(SYS_getcpu) && defined(SYS_get_cpu) # define SYS_getcpu SYS_get_cpu // some older glibc versions use this name # endif +# if defined(SYS_getcpu) getcpu_ret = syscall(SYS_getcpu, ¤t_cpu, &g_state.numa.current_node); #endif #endif From a31c936c5afe0e447e0c183ab398c780d37bb928 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Tue, 12 Mar 2024 21:40:46 +0000 Subject: [PATCH 004/105] try to detect 
the PHI cross compiler in make. --- Makefile | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Makefile b/Makefile index c8fd3f5c5..6146d406d 100644 --- a/Makefile +++ b/Makefile @@ -92,6 +92,8 @@ CC := riscv64-unknown-linux-gnu-gcc CXX := riscv64-unknown-linux-gnu-g++ endif +K1OM := $( shell echo | $CC -dM -E - | grep __k1om__ ) + # # Compile flags # @@ -274,6 +276,9 @@ endif ifndef RISCV ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) + +ifeq "$(K1OM)" "" + # Use all CPU extensions that are available: MK_CFLAGS += -march=native -mtune=native HOST_CXXFLAGS += -march=native -mtune=native @@ -287,6 +292,8 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) #MK_CXXFLAGS += -mssse3 endif +endif + ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))' # The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves. # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412 From aec982eefd47e0c0130e9c9f885a7897e7a7f685 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Tue, 12 Mar 2024 21:54:38 +0000 Subject: [PATCH 005/105] try to detect the PHI cross compiler in make. --- Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 6146d406d..d7bd4ed3b 100644 --- a/Makefile +++ b/Makefile @@ -92,7 +92,7 @@ CC := riscv64-unknown-linux-gnu-gcc CXX := riscv64-unknown-linux-gnu-g++ endif -K1OM := $( shell echo | $CC -dM -E - | grep __k1om__ ) +K1OM := $(shell echo | $(CC) -dM -E - | grep __k1om__) # # Compile flags @@ -277,7 +277,8 @@ ifndef RISCV ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) -ifeq "$(K1OM)" "" +# detect the PHI cross compiler. +ifeq "${K1OM}" "" # Use all CPU extensions that are available: MK_CFLAGS += -march=native -mtune=native From f346a41deb2f4fc8be1ba321cb360c06262b511b Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Wed, 13 Mar 2024 19:18:10 +0000 Subject: [PATCH 006/105] try to implement one intrinsic --- ggml.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 80b987b37..0716e6bd9 100644 --- a/ggml.c +++ b/ggml.c @@ -803,7 +803,38 @@ inline static float vaddvq_f32(float32x4_t v) { // number of elements to fit in a single register // -#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) + +#if defined(__k1om__) /* Xeon PHI Knights Corner (IMCI) */ + +// No, we have an SIMD unit. +// #define GGML_SIMD + +// This SIMD unit can work with 32 float32s at once. +#define GGML_F32_STEP 32 +// We can fit 16 of these float32s in a single vector register. +#define GGML_F32_EPR 16 + +// because we are not defining GGML_SIMD, we have to do this ourself. +#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR) + +// our vector. 
128*32=512 +typedef float32_t float32x16_t __attribute__((vector_size (128))); +#define GGML_F32x16 float32x16_t +#define GGML_F32x16_ZERO \ + { \ + __mmask16 mask=0xFFFF; \ + float32x16_t res; \ + asm ("vbroadcastf32x4 [RES] {[M]}, 0[%2]" \ + : [RES] "=x"(res) \ + : [M] "k" mask, \ + [V] "r" 0.0f) \ + return res; \ + } +//vdupq_n_f32(0.0f) + +#define GGML_F32_VEC GGML_F32x16 + +#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) #define GGML_SIMD @@ -1330,6 +1361,7 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } + static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); @@ -1362,6 +1394,17 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * for (int i = np; i < n; ++i) { sumf += x[i]*y[i]; } +#elif defined(__k1om__) + // our result, in the end. + float sumf = 0.0f; + // the number of vector-sized steps we will need to do. + const int np = (n & ~(GGML_F32_STEP - 1)); + + GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; + for (int i = 0; i < 16; ++i) { + fprintf(stderr, "boo: %f\n",sum[0]); + } + #else // scalar ggml_float sumf = 0.0; From a1ae649662d40a38b0520c537782456355f6bc29 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Wed, 13 Mar 2024 19:23:53 +0000 Subject: [PATCH 007/105] use right type, and define GGML_F32_VEC_ZERO. --- ggml.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 0716e6bd9..e56d7337a 100644 --- a/ggml.c +++ b/ggml.c @@ -818,7 +818,7 @@ inline static float vaddvq_f32(float32x4_t v) { #define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR) // our vector. 128*32=512 -typedef float32_t float32x16_t __attribute__((vector_size (128))); +typedef float float32x16_t __attribute__((vector_size (128))); #define GGML_F32x16 float32x16_t #define GGML_F32x16_ZERO \ { \ @@ -833,6 +833,7 @@ typedef float32_t float32x16_t __attribute__((vector_size (128))); //vdupq_n_f32(0.0f) #define GGML_F32_VEC GGML_F32x16 +#define GGML_F32_VEC_ZERO GGML_F32x16_ZERO #elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) From 7a57feba0cd98c2bfe0f3a4a05a7327145afb506 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Wed, 13 Mar 2024 19:26:54 +0000 Subject: [PATCH 008/105] import intrinsics. --- ggml.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ggml.c b/ggml.c index e56d7337a..009739e27 100644 --- a/ggml.c +++ b/ggml.c @@ -41,6 +41,10 @@ #pragma warning(disable: 4996) #endif +#if defined(__k1om__) +#include +#endif + #if defined(_WIN32) #include From 717e164dd7178cff77237bb1d168bc29f32c4b87 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 16 Mar 2024 14:05:03 +0000 Subject: [PATCH 009/105] implement F32 dot products. --- Makefile | 3 ++ ggml-phi-knc.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++++ ggml-phi-knc.h | 16 +++++++ ggml.c | 48 ++------------------ 4 files changed, 139 insertions(+), 44 deletions(-) create mode 100644 ggml-phi-knc.c create mode 100644 ggml-phi-knc.h diff --git a/Makefile b/Makefile index d7bd4ed3b..ea27321bf 100644 --- a/Makefile +++ b/Makefile @@ -291,6 +291,9 @@ ifeq "${K1OM}" "" # Usage SSSE3-only (Not is SSE3!) 
#MK_CFLAGS += -mssse3 #MK_CXXFLAGS += -mssse3 +else + OBJS += ggml-phi-knc.o + MK_CFLAGS += -march=knc -mtune=knc endif endif diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c new file mode 100644 index 000000000..ff94104a7 --- /dev/null +++ b/ggml-phi-knc.c @@ -0,0 +1,116 @@ +#include + +#include + +#include + +static inline _Bool is_aligned(const void *restrict pointer, size_t byte_count) +{ return (uintptr_t)pointer % byte_count == 0; } + +// No, we have an SIMD unit. +// #define GGML_SIMD + +// This SIMD unit can work with 32 float32s at once. +#define GGML_F32_STEP 32 +// We can fit 16 of these float32s in a single vector register. +#define GGML_F32_EPR 16 + +// because we are not defining GGML_SIMD, we have to do this ourself. +#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR) + +// a single vector. 128*32=512 +typedef float float32x16_t __attribute__((vector_size (128))); +#define GGML_F32x16 float32x16_t + +// from chatGPT. nuke this later. +#include + +inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target) +{ + // we only need a mask16, but register sizes... + __mmask32 mask=0xFFFFFFFF; + + // FIXME: how do we tell GNU AS to perform upconverts? + float zero[4] __attribute__((aligned(64))) = {0.0f,0.0f,0.0f,0.0f}; + + __asm__ __volatile__ ("movl\t%[M],\t%%eax\n\t" + "kmov %%eax,\t%%k1\n\t" + "vbroadcastf32x4\t%[Z],\t%%zmm0%{%%k1%}\n\t" + "vmovaps\t\t%%zmm0,\t%[RES]%{%%k1%}\n\t" + : [RES] "+m" (*target) + : [M] "m" (mask), + [Z] "m" (zero) + : "eax", "k1", "zmm0"); +} + +// multiply each item in mvec1 with the corresponding item in mvec2, adding the result to the corresponding item in sum. +inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x16_t *mvec2, float32x16_t *sumvec, size_t iterations) +{ + // we only need a mask16, but register sizes... + __mmask32 mask=0xFFFFFFFF; + __asm__ __volatile__ ( + "vmovaps\t\t(%[RES]),\t%%zmm0\n\t" // load our initial state.. + "1:\n\t" + "cmp $0,\t%[ITER]\n\t" // Compare iterations to 0 + "je\t2f\n\t" // Jump to label 2 if zero (end of loop) + "vmovaps\t\t(%[VEC1]),\t%%zmm1\n\t" // Load two vectors. + "vmovaps\t\t(%[VEC2]),\t%%zmm2\n\t" + "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add. + "add $64,\t%[VEC1]\n\t" // Move to the next float32x16_t (64 bytes ahead) + "add $64,\t%[VEC2]\n\t" + "sub $1,\t%[ITER]\n\t" // Decrement iterations + "jmp 1b\n\t" // Jump back to the start of the loop + "2: \n\t" // Label for loop end + "vmovaps\t\t%%zmm0,\t(%[RES])\n\t" // save our results. + : [RES] "+r" (sumvec), + [ITER] "+r" (iterations) + : [M] "r" (mask), + [VEC1] "r" (mvec1), + [VEC2] "r" (mvec2) + : "zmm0", "zmm1", "zmm2", "cc", "memory"); +} + + +// NOTE: all inputs must be __attribute__((aligned(64))); +float DotProduct_F32(const float * restrict inVec1, const float * restrict inVec2, uint32_t count) +{ + // our single result, in the end. + float sumf = 0.0f; + + // our sum. + float32x16_t sum __attribute__((aligned(64))); + + // the number of vector-sized steps we will need to do. + const uint32_t np = (count & ~(GGML_F32_EPR - 1)); + + GGML_F32x16_VEC_ZERO(&sum); + + // 0 indexed cycle count + // for (uint32_t cycle = 0; cycle < (np/GGML_F32_EPR); ++cycle) + GGML_F32x16_VEC_FMA((float32x16_t *)inVec1, (float32x16_t *)inVec2, &sum, np/GGML_F32_EPR); + + if (count != np) + { + printf("handling remainder %u\n",count-np); + // add the leftovers, that could not be handled by the vector loop. + // our extended last part of inVec1. 
+ float32x16_t v1 __attribute__((aligned(64))); + GGML_F32x16_VEC_ZERO(&v1); + // our extended last part of inVec2. + float32x16_t v2 __attribute__((aligned(64))); + GGML_F32x16_VEC_ZERO(&v2); + + memcpy(&v1, &inVec1[np], (count - np)*sizeof(float)); + memcpy(&v2, &inVec2[np], (count - np)*sizeof(float)); + + GGML_F32x16_VEC_FMA(&v1, + &v2, + &sum, 1); + } + + // reduce sum0..sumX to sumf + for (uint32_t i=0; i + +GGML_CALL float DotProduct_F32(const float * restrict vec1, const float * restrict vec2, uint32_t count); + +#ifdef __cplusplus +} +#endif + diff --git a/ggml.c b/ggml.c index 009739e27..696b46216 100644 --- a/ggml.c +++ b/ggml.c @@ -42,7 +42,7 @@ #endif #if defined(__k1om__) -#include +#include #endif #if defined(_WIN32) @@ -808,38 +808,7 @@ inline static float vaddvq_f32(float32x4_t v) { // -#if defined(__k1om__) /* Xeon PHI Knights Corner (IMCI) */ - -// No, we have an SIMD unit. -// #define GGML_SIMD - -// This SIMD unit can work with 32 float32s at once. -#define GGML_F32_STEP 32 -// We can fit 16 of these float32s in a single vector register. -#define GGML_F32_EPR 16 - -// because we are not defining GGML_SIMD, we have to do this ourself. -#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR) - -// our vector. 128*32=512 -typedef float float32x16_t __attribute__((vector_size (128))); -#define GGML_F32x16 float32x16_t -#define GGML_F32x16_ZERO \ - { \ - __mmask16 mask=0xFFFF; \ - float32x16_t res; \ - asm ("vbroadcastf32x4 [RES] {[M]}, 0[%2]" \ - : [RES] "=x"(res) \ - : [M] "k" mask, \ - [V] "r" 0.0f) \ - return res; \ - } -//vdupq_n_f32(0.0f) - -#define GGML_F32_VEC GGML_F32x16 -#define GGML_F32_VEC_ZERO GGML_F32x16_ZERO - -#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) #define GGML_SIMD @@ -1374,7 +1343,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * UNUSED(by); UNUSED(bs); -#ifdef GGML_SIMD +#if defined(GGML_SIMD) float sumf = 0.0f; const int np = (n & ~(GGML_F32_STEP - 1)); @@ -1400,16 +1369,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * sumf += x[i]*y[i]; } #elif defined(__k1om__) - // our result, in the end. - float sumf = 0.0f; - // the number of vector-sized steps we will need to do. 
- const int np = (n & ~(GGML_F32_STEP - 1)); - - GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; - for (int i = 0; i < 16; ++i) { - fprintf(stderr, "boo: %f\n",sum[0]); - } - + float sumf = DotProduct_F32(x, y, n); #else // scalar ggml_float sumf = 0.0; From 257ffd99550f9c55e434929e23705d4e964b9d1d Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 16 Mar 2024 14:13:22 +0000 Subject: [PATCH 010/105] Update ggml.c --- ggml.c | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml.c b/ggml.c index 696b46216..223dfcda1 100644 --- a/ggml.c +++ b/ggml.c @@ -807,7 +807,6 @@ inline static float vaddvq_f32(float32x4_t v) { // number of elements to fit in a single register // - #if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) #define GGML_SIMD From e216a2f133a3d2583e8dc426d50e2d8e66e4b5c3 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 16 Mar 2024 14:15:51 +0000 Subject: [PATCH 011/105] Update ggml.c --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 223dfcda1..76530be9f 100644 --- a/ggml.c +++ b/ggml.c @@ -1342,7 +1342,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * UNUSED(by); UNUSED(bs); -#if defined(GGML_SIMD) +#ifdef GGML_SIMD float sumf = 0.0f; const int np = (n & ~(GGML_F32_STEP - 1)); From eac00a72d512d56260eb4e4aca65c7f936b6273a Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 16 Mar 2024 14:17:21 +0000 Subject: [PATCH 012/105] Update ggml.c --- ggml.c | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml.c b/ggml.c index 76530be9f..708d1698b 100644 --- a/ggml.c +++ b/ggml.c @@ -1334,7 +1334,6 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } - static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); From fe663c1b63600c23b3be25a85101c67dc6300000 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sun, 17 Mar 2024 21:15:32 +0000 Subject: [PATCH 013/105] merge from upstream --- Makefile | 13 ++++ ggml-phi-knc.c | 166 +++++++++++++++++++++++++++++-------------------- ggml-phi-knc.h | 5 +- ggml.c | 11 +++- 4 files changed, 121 insertions(+), 74 deletions(-) diff --git a/Makefile b/Makefile index ea27321bf..3dbf3f2f0 100644 --- a/Makefile +++ b/Makefile @@ -691,6 +691,9 @@ clean: # Helper function that replaces .c, .cpp, and .cu file endings with .o: GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1)))) +# Helper function that replaces .c, .cpp, and .cu file endings with .s: +GET_ASM_FILE = $(patsubst %.c,%.s,$(patsubst %.cpp,%.s,$(patsubst %.cu,%.s,$(1)))) + main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -698,6 +701,16 @@ main: examples/main/main.cpp ggml.o llama.o $(C @echo '==== Run ./main -h for help. 
====' @echo +bench-phi-knc.s: bench-phi-knc.c + $(CC) $(CFLAGS) -S $< -o $(call GET_ASM_FILE, $<) + +ggml-phi-knc.s: ggml-phi-knc.c + $(CC) $(CFLAGS) -S $< -o $(call GET_ASM_FILE, $<) + +bench-phi-knc: bench-phi-knc.c ggml-phi-knc.o + $(CC) $(CFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CC) $(CFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index ff94104a7..648f81bcf 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -4,9 +4,6 @@ #include -static inline _Bool is_aligned(const void *restrict pointer, size_t byte_count) -{ return (uintptr_t)pointer % byte_count == 0; } - // No, we have an SIMD unit. // #define GGML_SIMD @@ -15,102 +12,135 @@ static inline _Bool is_aligned(const void *restrict pointer, size_t byte_count) // We can fit 16 of these float32s in a single vector register. #define GGML_F32_EPR 16 -// because we are not defining GGML_SIMD, we have to do this ourself. -#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR) - // a single vector. 128*32=512 typedef float float32x16_t __attribute__((vector_size (128))); #define GGML_F32x16 float32x16_t -// from chatGPT. nuke this later. -#include +// A forward declaration, to keep GCC happy... +void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target) { - // we only need a mask16, but register sizes... - __mmask32 mask=0xFFFFFFFF; - - // FIXME: how do we tell GNU AS to perform upconverts? + // FIXME: how do we tell GNU AS to perform upconverts? Could remove two memory reads here... float zero[4] __attribute__((aligned(64))) = {0.0f,0.0f,0.0f,0.0f}; - __asm__ __volatile__ ("movl\t%[M],\t%%eax\n\t" - "kmov %%eax,\t%%k1\n\t" - "vbroadcastf32x4\t%[Z],\t%%zmm0%{%%k1%}\n\t" - "vmovaps\t\t%%zmm0,\t%[RES]%{%%k1%}\n\t" - : [RES] "+m" (*target) - : [M] "m" (mask), - [Z] "m" (zero) - : "eax", "k1", "zmm0"); -} - -// multiply each item in mvec1 with the corresponding item in mvec2, adding the result to the corresponding item in sum. -inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x16_t *mvec2, float32x16_t *sumvec, size_t iterations) -{ - // we only need a mask16, but register sizes... - __mmask32 mask=0xFFFFFFFF; __asm__ __volatile__ ( - "vmovaps\t\t(%[RES]),\t%%zmm0\n\t" // load our initial state.. - "1:\n\t" - "cmp $0,\t%[ITER]\n\t" // Compare iterations to 0 - "je\t2f\n\t" // Jump to label 2 if zero (end of loop) - "vmovaps\t\t(%[VEC1]),\t%%zmm1\n\t" // Load two vectors. - "vmovaps\t\t(%[VEC2]),\t%%zmm2\n\t" - "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add. - "add $64,\t%[VEC1]\n\t" // Move to the next float32x16_t (64 bytes ahead) - "add $64,\t%[VEC2]\n\t" - "sub $1,\t%[ITER]\n\t" // Decrement iterations - "jmp 1b\n\t" // Jump back to the start of the loop - "2: \n\t" // Label for loop end - "vmovaps\t\t%%zmm0,\t(%[RES])\n\t" // save our results. - : [RES] "+r" (sumvec), - [ITER] "+r" (iterations) - : [M] "r" (mask), - [VEC1] "r" (mvec1), - [VEC2] "r" (mvec2) - : "zmm0", "zmm1", "zmm2", "cc", "memory"); + "vbroadcastf32x4\t%[Z],\t%%zmm8\n\t" // use an upscaling operator to clear our value. 
+ "vmovaps\t\t%%zmm8,\t%[RES]\n\t" + : [RES] "+m" (*target) + : [Z] "m" (zero) + : "zmm8"); } - -// NOTE: all inputs must be __attribute__((aligned(64))); -float DotProduct_F32(const float * restrict inVec1, const float * restrict inVec2, uint32_t count) +// Multiply each item in mvec1 with the corresponding item in mvec2, adding the result to the corresponding item in sum. optionally clear the sum before starting. +inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x16_t *mvec2, float32x16_t *sumvec, size_t iterations, int clear) { - // our single result, in the end. - float sumf = 0.0f; + // FIXME: how do we tell GNU AS to perform upconverts? Could remove two memory reads here... + float zero[4] __attribute__((aligned(64))) = {0.0f,0.0f,0.0f,0.0f}; + __asm__ __volatile__ ( + "mov\t%[ITER],%%r8\n\t" // how many register sized chunks are we responsible for + "mov\t%[VEC1],%%r10\n\t" // where do we start work in mvec1? + "mov\t%[VEC2],%%r12\n\t" // where do we start work in mvec2? + "cmp\t$1,%[CLR]\n\t" // should we clear the sum before we start? + "jne\t4f\n\t" + "vbroadcastf32x4\t%[Z],\t%%zmm0\n\t" // if so, use an upscaling operator to do it. + "vprefetchnta\t(%%r10)\n\t" + "vprefetchnta\t(%%r12)\n\t" + "vprefetch1\t128(%%r10)\n\t" + "vprefetch1\t128(%%r12)\n\t" + "vprefetch1\t256(%%r10)\n\t" + "vprefetch1\t256(%%r12)\n\t" + "vprefetch1\t384(%%r10)\n\t" + "vprefetch1\t384(%%r12)\n\t" + "vprefetch1\t512(%%r10)\n\t" + "vprefetch1\t512(%%r12)\n\t" + "jmp\t1f\n\t" + "4:\n\t" + "vprefetch0\t(%[RES])\n\t" + "vmovaps\t\t(%[RES]),\t%%zmm0\n\t" // otherwise, load our inital state from sum.. + "vprefetchnta\t(%%r10)\n\t" + "vprefetchnta\t(%%r12)\n\t" + "1:\n\t" + "cmp\t$3,\t%%r8\n\t" // Compare iterations to three. + "jnae\t6f\n\t" // If there are not three iterations left, jump to label 6. + "vmovaps\t\t(%%r10),\t%%zmm1\n\t" // Load two vectors. + "vmovaps\t\t(%%r12),\t%%zmm2\n\t" + "sub\t$3,\t%%r8\n\t" // Decrement iterations + "vprefetchnta\t192(%%r10)\n\t" // prefetch the next float32x16_t block (192 bytes ahead) + "vprefetchnta\t192(%%r12)\n\t" + "vmovaps\t\t64(%%r10),\t%%zmm3\n\t" // Load two vectors. + "vmovaps\t\t64(%%r12),\t%%zmm4\n\t" + "vprefetch1\t320(%%r10)\n\t" // prefetch the block after the block after the next float32x16_t block (320 bytes ahead) + "vprefetch1\t320(%%r12)\n\t" + "vmovaps\t\t128(%%r10),\t%%zmm5\n\t" // Load two vectors. + "vmovaps\t\t128(%%r12),\t%%zmm6\n\t" + "vprefetch1\t576(%%r10)\n\t" + "vprefetch1\t576(%%r12)\n\t" + "vprefetch1\t704(%%r10)\n\t" + "vprefetch1\t704(%%r12)\n\t" + "add\t$192,\t%%r10\n\t" // Move to the next float32x16_t block (192 bytes ahead) + "add\t$192,\t%%r12\n\t" + "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add + "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add + "vfmadd231ps\t%%zmm5,\t%%zmm6,\t%%zmm0\n\t" // Perform a fused multiply add + "jmp\t1b\n\t" // Jump back to the start of the loop + "6:\n\t" // we know we are near the tail. handle 2, 1, and 0 cases. + "cmp\t$0,\t%%r8\n\t" // Compare iterations to zero + "je\t2f\n\t" // Jump to label 2 if zero (end of loop) + "cmp\t$1,\t%%r8\n\t" // Compare iterations to one + "vmovaps\t\t(%%r10),\t%%zmm1\n\t" // Load two vectors. + "vmovaps\t\t(%%r12),\t%%zmm2\n\t" + "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add + "je\t2f\n\t" // Jump to label 3 if one (end of loop) + // No compare. we must be two. + "vmovaps\t\t64(%%r10),\t%%zmm3\n\t" // Load two vectors. 
+ "vmovaps\t\t64(%%r12),\t%%zmm4\n\t" + "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add + "2:\n\t" // Label for loop end + "vmovaps\t\t%%zmm0,\t(%[RES])\n\t" // save our results. + : [RES] "+r" (sumvec) + : [ITER] "r" (iterations), + [VEC1] "r" (mvec1), + [VEC2] "r" (mvec2), + [CLR] "r" (clear), + [Z] "m" (zero) + : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "cc", "memory", "r8", "r10", "r12"); +} + +// NOTE: x and y inputs must be __attribute__((aligned(64))); +void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) +{ // our sum. float32x16_t sum __attribute__((aligned(64))); // the number of vector-sized steps we will need to do. - const uint32_t np = (count & ~(GGML_F32_EPR - 1)); + const uint32_t np = (n & ~(GGML_F32_EPR - 1)); - GGML_F32x16_VEC_ZERO(&sum); + GGML_F32x16_VEC_FMA((const float32x16_t *)x, (const float32x16_t *)y, &sum, np/GGML_F32_EPR, 1); - // 0 indexed cycle count - // for (uint32_t cycle = 0; cycle < (np/GGML_F32_EPR); ++cycle) - GGML_F32x16_VEC_FMA((float32x16_t *)inVec1, (float32x16_t *)inVec2, &sum, np/GGML_F32_EPR); - - if (count != np) + // FIXME: replace this with a final round using masked vectors. + if ( n - np != 0 ) { - printf("handling remainder %u\n",count-np); // add the leftovers, that could not be handled by the vector loop. - // our extended last part of inVec1. + // our extended last part of x. float32x16_t v1 __attribute__((aligned(64))); GGML_F32x16_VEC_ZERO(&v1); - // our extended last part of inVec2. + // our extended last part of y. float32x16_t v2 __attribute__((aligned(64))); GGML_F32x16_VEC_ZERO(&v2); - memcpy(&v1, &inVec1[np], (count - np)*sizeof(float)); - memcpy(&v2, &inVec2[np], (count - np)*sizeof(float)); + memcpy(&v1, &x[np], (n - np)*sizeof(float)); + memcpy(&v2, &y[np], (n - np)*sizeof(float)); GGML_F32x16_VEC_FMA(&v1, - &v2, - &sum, 1); + &v2, + &sum, 1, 0); + } - // reduce sum0..sumX to sumf + // reduce sum, and store it in s. for (uint32_t i=0; i - -GGML_CALL float DotProduct_F32(const float * restrict vec1, const float * restrict vec2, uint32_t count); +void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); #ifdef __cplusplus } #endif - diff --git a/ggml.c b/ggml.c index 708d1698b..7d555c969 100644 --- a/ggml.c +++ b/ggml.c @@ -41,6 +41,7 @@ #pragma warning(disable: 4996) #endif +// hand assembled replacement functions are cool. #if defined(__k1om__) #include #endif @@ -452,7 +453,11 @@ int64_t ggml_cycles_per_ms(void) { static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); +#if defined(__k1om__) +// We get this function from elsewhere. 
+#else static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); +#endif static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc); static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { @@ -1334,6 +1339,9 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } +#if defined(__k1om__) +// we get this function from elsewhere. +#else static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); @@ -1366,8 +1374,6 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * for (int i = np; i < n; ++i) { sumf += x[i]*y[i]; } -#elif defined(__k1om__) - float sumf = DotProduct_F32(x, y, n); #else // scalar ggml_float sumf = 0.0; @@ -1378,6 +1384,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * *s = sumf; } +#endif static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) { assert(nrc == 1); From f882673ba662dbc44c7733450e70e00847b50e06 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sun, 17 Mar 2024 21:20:14 +0000 Subject: [PATCH 014/105] add a benchmark / test binary. --- bench-phi-knc.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 bench-phi-knc.c diff --git a/bench-phi-knc.c b/bench-phi-knc.c new file mode 100644 index 000000000..7f5431d87 --- /dev/null +++ b/bench-phi-knc.c @@ -0,0 +1,64 @@ +#include +#include +#include +#include +#include /*for CLOCK_REALTIME? */ + +void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); + +#include + + +#define MAXVEC 1024768 +#define RUNTOTAL 12 +#define RUNS +int main(void) +{ + struct timespec start, middle, end; + double vector_time; + double scalar_time; + float scalar = 0.0f; + float vector = 0.0f; + uint32_t vecRuns[] = {10, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 1024768}; + for (uint32_t runCount = 0; runCount < RUNTOTAL; ++runCount) + { + // Generate random input vector of [-1, 1] values. + float vec1[MAXVEC] __attribute__((aligned(64))); + for (int i = 0; i < vecRuns[runCount]; i++) + vec1[i] = 2 * (0.5 - rand() / (float)RAND_MAX); + + // Generate a second random input vector of [-1, 1] values. + float vec2[MAXVEC] __attribute__((aligned(64))); + for (int i = 0; i < vecRuns[runCount]; i++) + vec2[i] = 2 * (0.5 - rand() / (float)RAND_MAX); + + // on your mark.. + clock_gettime(CLOCK_MONOTONIC, &start); + + // call dot product + ggml_vec_dot_f32(vecRuns[runCount], &vector, 0, vec1, 0, vec2, 0, 0); + + // save the middle point.. 
+ clock_gettime(CLOCK_MONOTONIC, &middle); + + // do the same work by hand; + for (int i = 0; i < vecRuns[runCount]; ++i) + scalar += vec1[i]*vec2[i]; + + clock_gettime(CLOCK_MONOTONIC, &end); + + printf("vector\tvs\tscalar (%d items)\n", vector, scalar, vecRuns[runCount]); + printf("%.9f\tvs\t%.9f\n", vector, scalar); + + vector_time = middle.tv_sec - start.tv_sec; + vector_time += (middle.tv_nsec - start.tv_nsec) / 1000000000.0; + + scalar_time = end.tv_sec - middle.tv_sec; + scalar_time += (end.tv_nsec - middle.tv_nsec) / 1000000000.0; + + printf("%.9f\tvs\t%.9f\n", vector_time, scalar_time); + } + fflush(stdout); + + return 0; +} From ab6f3a8a8d38a0552d3e7494727c1bc5705bbe94 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sun, 17 Mar 2024 21:36:14 +0000 Subject: [PATCH 015/105] Update ggml-phi-knc.c --- ggml-phi-knc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index 648f81bcf..c15456781 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -4,6 +4,9 @@ #include +// For memcpy. +#include + // No, we have an SIMD unit. // #define GGML_SIMD From ee27148629ab9f522f17724a08134f7570353eea Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Wed, 20 Mar 2024 20:15:16 +0000 Subject: [PATCH 016/105] remove intrinsics import, and use upConv to save 12 bytes of memory transit. --- ggml-phi-knc.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index c15456781..d0b185899 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -1,5 +1,3 @@ -#include - #include #include @@ -24,11 +22,10 @@ void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restri inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target) { - // FIXME: how do we tell GNU AS to perform upconverts? Could remove two memory reads here... - float zero[4] __attribute__((aligned(64))) = {0.0f,0.0f,0.0f,0.0f}; + uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; __asm__ __volatile__ ( - "vbroadcastf32x4\t%[Z],\t%%zmm8\n\t" // use an upscaling operator to clear our value. + "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. "vmovaps\t\t%%zmm8,\t%[RES]\n\t" : [RES] "+m" (*target) : [Z] "m" (zero) @@ -38,8 +35,7 @@ inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target) // Multiply each item in mvec1 with the corresponding item in mvec2, adding the result to the corresponding item in sum. optionally clear the sum before starting. inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x16_t *mvec2, float32x16_t *sumvec, size_t iterations, int clear) { - // FIXME: how do we tell GNU AS to perform upconverts? Could remove two memory reads here... - float zero[4] __attribute__((aligned(64))) = {0.0f,0.0f,0.0f,0.0f}; + uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; __asm__ __volatile__ ( "mov\t%[ITER],%%r8\n\t" // how many register sized chunks are we responsible for @@ -47,7 +43,7 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x "mov\t%[VEC2],%%r12\n\t" // where do we start work in mvec2? "cmp\t$1,%[CLR]\n\t" // should we clear the sum before we start? "jne\t4f\n\t" - "vbroadcastf32x4\t%[Z],\t%%zmm0\n\t" // if so, use an upscaling operator to do it. + "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm0\n\t" // if so, use an upscaling operator to do it. 
"vprefetchnta\t(%%r10)\n\t" "vprefetchnta\t(%%r12)\n\t" "vprefetch1\t128(%%r10)\n\t" From 76e66e77c2cc2015902a8195e352ea0981679b0e Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Wed, 20 Mar 2024 21:12:22 +0000 Subject: [PATCH 017/105] use the same header as ggml.c, and remove some warnings. --- bench-phi-knc.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/bench-phi-knc.c b/bench-phi-knc.c index 7f5431d87..ee96715fb 100644 --- a/bench-phi-knc.c +++ b/bench-phi-knc.c @@ -3,11 +3,9 @@ #include #include #include /*for CLOCK_REALTIME? */ - -void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); - #include +#include "ggml-phi-knc.h" #define MAXVEC 1024768 #define RUNTOTAL 12 @@ -19,7 +17,7 @@ int main(void) double scalar_time; float scalar = 0.0f; float vector = 0.0f; - uint32_t vecRuns[] = {10, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 1024768}; + int vecRuns[] = {10, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 1024768}; for (uint32_t runCount = 0; runCount < RUNTOTAL; ++runCount) { // Generate random input vector of [-1, 1] values. @@ -47,7 +45,7 @@ int main(void) clock_gettime(CLOCK_MONOTONIC, &end); - printf("vector\tvs\tscalar (%d items)\n", vector, scalar, vecRuns[runCount]); + printf("vector\tvs\tscalar (%d items)\n", vecRuns[runCount]); printf("%.9f\tvs\t%.9f\n", vector, scalar); vector_time = middle.tv_sec - start.tv_sec; From ac3637142d51c9625ab6c44ca9f6d0363ee45d57 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Wed, 20 Mar 2024 21:34:12 +0000 Subject: [PATCH 018/105] formatting changes. --- ggml-phi-knc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index d0b185899..8001b7a84 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -43,7 +43,7 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x "mov\t%[VEC2],%%r12\n\t" // where do we start work in mvec2? "cmp\t$1,%[CLR]\n\t" // should we clear the sum before we start? "jne\t4f\n\t" - "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm0\n\t" // if so, use an upscaling operator to do it. + "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm0\n\t" // if so, use an upscaling operator to do it. "vprefetchnta\t(%%r10)\n\t" "vprefetchnta\t(%%r12)\n\t" "vprefetch1\t128(%%r10)\n\t" @@ -97,7 +97,7 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x "vmovaps\t\t64(%%r12),\t%%zmm4\n\t" "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add "2:\n\t" // Label for loop end - "vmovaps\t\t%%zmm0,\t(%[RES])\n\t" // save our results. + "vmovnraps\t\t%%zmm0,\t(%[RES])\n\t" // save our results. : [RES] "+r" (sumvec) : [ITER] "r" (iterations), [VEC1] "r" (mvec1), From 0979522fbe9ea14bfeb3ff88d56f4ce34ec2f43e Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Thu, 21 Mar 2024 18:36:25 +0000 Subject: [PATCH 019/105] spacing changes. 
--- bench-phi-knc.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/bench-phi-knc.c b/bench-phi-knc.c index ee96715fb..5a5da5fe5 100644 --- a/bench-phi-knc.c +++ b/bench-phi-knc.c @@ -9,7 +9,7 @@ #define MAXVEC 1024768 #define RUNTOTAL 12 -#define RUNS +#define RUNS int main(void) { struct timespec start, middle, end; @@ -18,13 +18,14 @@ int main(void) float scalar = 0.0f; float vector = 0.0f; int vecRuns[] = {10, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 1024768}; + for (uint32_t runCount = 0; runCount < RUNTOTAL; ++runCount) { // Generate random input vector of [-1, 1] values. float vec1[MAXVEC] __attribute__((aligned(64))); for (int i = 0; i < vecRuns[runCount]; i++) vec1[i] = 2 * (0.5 - rand() / (float)RAND_MAX); - + // Generate a second random input vector of [-1, 1] values. float vec2[MAXVEC] __attribute__((aligned(64))); for (int i = 0; i < vecRuns[runCount]; i++) @@ -38,11 +39,11 @@ int main(void) // save the middle point.. clock_gettime(CLOCK_MONOTONIC, &middle); - + // do the same work by hand; for (int i = 0; i < vecRuns[runCount]; ++i) scalar += vec1[i]*vec2[i]; - + clock_gettime(CLOCK_MONOTONIC, &end); printf("vector\tvs\tscalar (%d items)\n", vecRuns[runCount]); @@ -55,8 +56,9 @@ int main(void) scalar_time += (end.tv_nsec - middle.tv_nsec) / 1000000000.0; printf("%.9f\tvs\t%.9f\n", vector_time, scalar_time); - } + } + fflush(stdout); - + return 0; } From 9185e149221f398be3d73aafdcc1dc8b3bd61b87 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Thu, 21 Mar 2024 20:38:49 +0000 Subject: [PATCH 020/105] be more specific about the length of our list of run amounts. --- bench-phi-knc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench-phi-knc.c b/bench-phi-knc.c index 5a5da5fe5..4b7f9d192 100644 --- a/bench-phi-knc.c +++ b/bench-phi-knc.c @@ -17,7 +17,7 @@ int main(void) double scalar_time; float scalar = 0.0f; float vector = 0.0f; - int vecRuns[] = {10, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 1024768}; + int vecRuns[RUNSTOTAL] = {10, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 1024768}; for (uint32_t runCount = 0; runCount < RUNTOTAL; ++runCount) { From a7bd64c130e455fb5e5377ea30593768744b445a Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 14:19:47 +0000 Subject: [PATCH 021/105] begin work on targeting dot_q5_K_q8_K. --- Makefile | 2 +- ggml-phi-knc-dot_q5_K_q8_K.c | 49 ++++++++++++++++++++++++++++++++++++ ggml-phi-knc-dot_q5_K_q8_K.h | 14 +++++++++++ 3 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 ggml-phi-knc-dot_q5_K_q8_K.c create mode 100644 ggml-phi-knc-dot_q5_K_q8_K.h diff --git a/Makefile b/Makefile index 3dbf3f2f0..42861f4b4 100644 --- a/Makefile +++ b/Makefile @@ -292,7 +292,7 @@ ifeq "${K1OM}" "" #MK_CFLAGS += -mssse3 #MK_CXXFLAGS += -mssse3 else - OBJS += ggml-phi-knc.o + OBJS += ggml-phi-knc.o ggml-phi-knc-dot_q5_K_q8_K.o MK_CFLAGS += -march=knc -mtune=knc endif diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c new file mode 100644 index 000000000..9104a939c --- /dev/null +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -0,0 +1,49 @@ + +/* A forward declaration, to keep GCC happy. 
*/ +void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); + +void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { + + const block_q5_K * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; + + uint32_t utmp[4]; + int8_t aux8[QK_K]; + int16_t aux16[16]; + float sums [8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * restrict q4 = x[i].qs; + const uint8_t * restrict hm = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + int8_t * restrict a = aux8; + for (int l = 0; l < 32; ++l) { + a[l+ 0] = q4[l] & 0xF; + a[l+32] = q4[l] >> 4; + } + for (int is = 0; is < 8; ++is) { + uint8_t m = 1 << is; + for (int l = 0; l < 8; ++l) a[8*is + l] -= (hm[l] & m ? 0 : 16); + } + + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const int8_t * restrict sc = x[i].scales; + + for (int j = 0; j < QK_K/16; ++j) { + const float dl = d * sc[j]; + for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) sums[l] += dl * (aux16[l] + aux16[8+l]); + q8 += 16; a += 16; + } + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; +} diff --git a/ggml-phi-knc-dot_q5_K_q8_K.h b/ggml-phi-knc-dot_q5_K_q8_K.h new file mode 100644 index 000000000..b416803e0 --- /dev/null +++ b/ggml-phi-knc-dot_q5_K_q8_K.h @@ -0,0 +1,14 @@ +#pragma once + +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* A forward declaration, to keep GCC happy. */ +void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); + +#ifdef __cplusplus +} +#endif From 9bcb8350d57897b516b60f8fed11a4087b0cbb9d Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 14:28:29 +0000 Subject: [PATCH 022/105] import stdint.h for sizeSt. --- ggml-phi-knc-dot_q5_K_q8_K.c | 2 ++ ggml-phi-knc.c | 1 + 2 files changed, 3 insertions(+) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 9104a939c..a3ff0143d 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -1,3 +1,5 @@ +// For size_t +#include /* A forward declaration, to keep GCC happy. */ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index 8001b7a84..e5e034bb8 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -1,3 +1,4 @@ +// For size_t #include #include From 8f57803f58aca40227206dad75c9211589c34aa0 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 14:29:59 +0000 Subject: [PATCH 023/105] import stdio.h for size_t. --- ggml-phi-knc-dot_q5_K_q8_K.c | 5 ++++- ggml-phi-knc.c | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index a3ff0143d..ec571a3fb 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -1,6 +1,9 @@ -// For size_t +// For uint32_t #include +// For size_t +#include + /* A forward declaration, to keep GCC happy. 
*/ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index e5e034bb8..341bbc01b 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -1,6 +1,6 @@ -// For size_t #include +// For size_t #include // For memcpy. From cd20404250038ad5a03f36a45c0e5886fd35186c Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 14:38:15 +0000 Subject: [PATCH 024/105] pull in ggml specific types. --- ggml-phi-knc-dot_q5_K_q8_K.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index ec571a3fb..bfff9112d 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -4,6 +4,9 @@ // For size_t #include +// For block_q5_K and block_q8_K +#include "ggml-common.h" + /* A forward declaration, to keep GCC happy. */ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); From 18f353987c4d55c2c578b13073e56a3eb9c7294c Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 14:49:35 +0000 Subject: [PATCH 025/105] tell ggml-common.h to export what we want. --- ggml-phi-knc-dot_q5_K_q8_K.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index bfff9112d..651ad1684 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -4,6 +4,8 @@ // For size_t #include +// Yes, we have to tell this header to actually export stuff. +#define GGML_COMMON_IMPL_C // For block_q5_K and block_q8_K #include "ggml-common.h" From 0b3f17127fe7261d1a8fadba9420d2ffb2d8e53f Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 14:58:33 +0000 Subject: [PATCH 026/105] force to compile. --- ggml-phi-knc-dot_q5_K_q8_K.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 651ad1684..67b9e6025 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -6,7 +6,13 @@ // Yes, we have to tell this header to actually export stuff. #define GGML_COMMON_IMPL_C -// For block_q5_K and block_q8_K +#include "ggml-common.h" +#include "ggml-quants.h" +#include "ggml-impl.h" + +// FIXME: why do we have to import this twice? +#define GGML_COMMON_IMPL_C +// For block_q5_K and block_q8_K. only given the second time. #include "ggml-common.h" /* A forward declaration, to keep GCC happy. */ From 0b012c03efe53524a07d41c56f2270078c30a6a7 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 15:02:56 +0000 Subject: [PATCH 027/105] allow using code from ggml-phi-knc-dot_q5_K_q8_K.c --- ggml-quants.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index 109dd6660..93e51bb11 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -4,6 +4,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" +// FIXME: why do we import this twice? #define GGML_COMMON_IMPL_C #include "ggml-common.h" @@ -49,6 +50,11 @@ #include #endif +// hand assembled replacement functions are cool. +#if defined(__k1om__) +#include +#endif + #undef MIN #undef MAX @@ -7094,6 +7100,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r } #endif +#if defined(__k1om__) +/* We get this from elsewhere. 
*/ +#else #if QK_K == 256 void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); @@ -7518,7 +7527,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r #endif } -#else +#else /* QK_K != 256 */ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); @@ -7787,8 +7796,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r *s = sumf; #endif } -#endif +#endif /* end QK_K != 256 */ +#endif /* defined(__k1om__) */ #if QK_K == 256 void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { From 0a2051aa88e8bff9109306df7e7bf57ccced63d2 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 15:55:00 +0000 Subject: [PATCH 028/105] attempt to speed up float clearing. --- ggml-phi-knc-dot_q5_K_q8_K.c | 91 +++++++++++++++++++++++------------- 1 file changed, 59 insertions(+), 32 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 67b9e6025..8e659ede8 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -15,51 +15,78 @@ // For block_q5_K and block_q8_K. only given the second time. #include "ggml-common.h" + +// This SIMD unit can work with 32 float32s at once. +#define GGML_F32_STEP 32 +// We can fit 16 of these float32s in a single vector register. +#define GGML_F32_EPR 16 + +typedef float float32x8_t __attribute__((vector_size (64))); + /* A forward declaration, to keep GCC happy. */ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); +inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) +{ + uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; + uint32_t mask=0x000000FF; + + __asm__ __volatile__ ( + "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. + "kmov\t%[M],\t%%k1\n\t" + "vmovaps\t\t%%zmm8,\t%[RES]%{%%k1%}\n\t" + : [RES] "+m" (*target) + : [Z] "m" (zero) + : [M] "r" (mask) + : "r9", "zmm8", "k1"); +} + void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { + /* interpret X and Y as vectors. */ const block_q5_K * restrict x = vx; const block_q8_K * restrict y = vy; - + + /* the number of blocks we will process this in. */ const int nb = n / QK_K; - + static const uint32_t kmask1 = 0x3f3f3f3f; static const uint32_t kmask2 = 0x0f0f0f0f; static const uint32_t kmask3 = 0x03030303; - + uint32_t utmp[4]; - int8_t aux8[QK_K]; - int16_t aux16[16]; - float sums [8]; - memset(sums, 0, 8*sizeof(float)); + int8_t aux8[QK_K]; + int16_t aux16[16]; + float32x8_t sums __attribute__((aligned(64))); - float sumf = 0; - for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - int8_t * restrict a = aux8; - for (int l = 0; l < 32; ++l) { - a[l+ 0] = q4[l] & 0xF; - a[l+32] = q4[l] >> 4; - } - for (int is = 0; is < 8; ++is) { - uint8_t m = 1 << is; - for (int l = 0; l < 8; ++l) a[8*is + l] -= (hm[l] & m ? 0 : 16); - } + /* use a vector operation to clear these floats. 
*/ + GGML_F32x8_VEC_ZERO(&sums); - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const int8_t * restrict sc = x[i].scales; - - for (int j = 0; j < QK_K/16; ++j) { - const float dl = d * sc[j]; - for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) sums[l] += dl * (aux16[l] + aux16[8+l]); - q8 += 16; a += 16; - } + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * restrict q4 = x[i].qs; + const uint8_t * restrict hm = x[i].qh; + const int8_t * restrict q8 = y[i].qs; + int8_t * restrict a = aux8; + for (int l = 0; l < 32; ++l) { + a[l+ 0] = q4[l] & 0xF; + a[l+32] = q4[l] >> 4; } - for (int l = 0; l < 8; ++l) sumf += sums[l]; - *s = sumf; + for (int is = 0; is < 8; ++is) { + uint8_t m = 1 << is; + for (int l = 0; l < 8; ++l) a[8*is + l] -= (hm[l] & m ? 0 : 16); + } + + const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const int8_t * restrict sc = x[i].scales; + + for (int j = 0; j < QK_K/16; ++j) { + const float dl = d * sc[j]; + for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) ((float *)sums)[l] += dl * (aux16[l] + aux16[8+l]); + q8 += 16; a += 16; + } + } + for (int l = 0; l < 8; ++l) sumf += ((float *)sums)[l]; + *s = sumf; } From 6face8a0bebe3e0e24ff2ca8a2b6feae3a2c885d Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 15:56:47 +0000 Subject: [PATCH 029/105] first fixes. --- ggml-phi-knc-dot_q5_K_q8_K.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 8e659ede8..e9ee43844 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -36,8 +36,8 @@ inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) "kmov\t%[M],\t%%k1\n\t" "vmovaps\t\t%%zmm8,\t%[RES]%{%%k1%}\n\t" : [RES] "+m" (*target) - : [Z] "m" (zero) - : [M] "r" (mask) + : [Z] "m" (zero), + [M] "r" (mask) : "r9", "zmm8", "k1"); } @@ -83,10 +83,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int j = 0; j < QK_K/16; ++j) { const float dl = d * sc[j]; for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) ((float *)sums)[l] += dl * (aux16[l] + aux16[8+l]); + for (int l = 0; l < 8; ++l) ((float *)&sums)[l] += dl * (aux16[l] + aux16[8+l]); q8 += 16; a += 16; } } - for (int l = 0; l < 8; ++l) sumf += ((float *)sums)[l]; + for (int l = 0; l < 8; ++l) sumf += ((float *)&sums)[l]; *s = sumf; } From edb76ffddbad7b6269fb0e078a05668bd2deb970 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 16:19:17 +0000 Subject: [PATCH 030/105] formatting improvement. --- ggml-phi-knc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml-phi-knc.h b/ggml-phi-knc.h index a4b59ae04..d2fd11428 100644 --- a/ggml-phi-knc.h +++ b/ggml-phi-knc.h @@ -6,7 +6,8 @@ extern "C" { #endif -void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); + /* A forward declaration, to keep GCC happy. */ + void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); #ifdef __cplusplus } From e3503c924adf2739fa8ebe3ef7f8454427ed8fed Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 16:21:20 +0000 Subject: [PATCH 031/105] promote aux16 into a vector. 
--- ggml-phi-knc-dot_q5_K_q8_K.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index e9ee43844..251591214 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -22,6 +22,7 @@ #define GGML_F32_EPR 16 typedef float float32x8_t __attribute__((vector_size (64))); +typedef int16 int16x16_t __attribute__((vector_size (64))); /* A forward declaration, to keep GCC happy. */ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); @@ -56,15 +57,19 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r uint32_t utmp[4]; int8_t aux8[QK_K]; - int16_t aux16[16]; + // int16_t aux16[16]; + int16x16_t aux16; float32x8_t sums __attribute__((aligned(64))); /* use a vector operation to clear these floats. */ GGML_F32x8_VEC_ZERO(&sums); float sumf = 0; + for (int i = 0; i < nb; ++i) { + // quants, 4 low bits. const uint8_t * restrict q4 = x[i].qs; + // quants, 1 high bit. const uint8_t * restrict hm = x[i].qh; const int8_t * restrict q8 = y[i].qs; int8_t * restrict a = aux8; @@ -82,8 +87,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int j = 0; j < QK_K/16; ++j) { const float dl = d * sc[j]; - for (int l = 0; l < 16; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) ((float *)&sums)[l] += dl * (aux16[l] + aux16[8+l]); + for (int l = 0; l < 16; ++l) ((int16 *)&aux16)[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) ((float *)&sums)[l] += dl * (((int16 *)&aux16)[l] + ((int16 *)&aux16)[8+l]); q8 += 16; a += 16; } } From c72157a5a6fe0cfd3721bc5ff9f111c0ae6bbc50 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 16:24:11 +0000 Subject: [PATCH 032/105] promote aux16 into a vector. --- ggml-phi-knc-dot_q5_K_q8_K.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 251591214..7e149f34d 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -58,7 +58,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r uint32_t utmp[4]; int8_t aux8[QK_K]; // int16_t aux16[16]; - int16x16_t aux16; + int16x16_t aux16 __attribute__((aligned(64))); float32x8_t sums __attribute__((aligned(64))); /* use a vector operation to clear these floats. */ @@ -87,8 +87,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int j = 0; j < QK_K/16; ++j) { const float dl = d * sc[j]; - for (int l = 0; l < 16; ++l) ((int16 *)&aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) ((float *)&sums)[l] += dl * (((int16 *)&aux16)[l] + ((int16 *)&aux16)[8+l]); + for (int l = 0; l < 16; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) ((float *)&sums)[l] += dl * (((int16_t *)&aux16)[l] + ((int16_t *)&aux16)[8+l]); q8 += 16; a += 16; } } From f092a10dc9261d67e9a0c483595f3ad9356d64c7 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 16:27:11 +0000 Subject: [PATCH 033/105] promote aux16 into a vector. 
(part three) --- ggml-phi-knc-dot_q5_K_q8_K.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 7e149f34d..acb965c95 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -22,7 +22,7 @@ #define GGML_F32_EPR 16 typedef float float32x8_t __attribute__((vector_size (64))); -typedef int16 int16x16_t __attribute__((vector_size (64))); +typedef int16_t int16x16_t __attribute__((vector_size (64))); /* A forward declaration, to keep GCC happy. */ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); From e43a63e7c622071a9dc481491f6a586263b922fa Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 16:29:30 +0000 Subject: [PATCH 034/105] fix typo. --- bench-phi-knc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench-phi-knc.c b/bench-phi-knc.c index 4b7f9d192..a59e2e5b7 100644 --- a/bench-phi-knc.c +++ b/bench-phi-knc.c @@ -17,7 +17,7 @@ int main(void) double scalar_time; float scalar = 0.0f; float vector = 0.0f; - int vecRuns[RUNSTOTAL] = {10, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 1024768}; + int vecRuns[RUNTOTAL] = {10, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 1024768}; for (uint32_t runCount = 0; runCount < RUNTOTAL; ++runCount) { From 31d4f9312be9fd551606c65c1ebf8e05d0863a17 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 19:47:21 +0000 Subject: [PATCH 035/105] copy right block. --- ggml-phi-knc-dot_q5_K_q8_K.c | 76 +++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 28 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index acb965c95..cab3b9dc2 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -30,7 +30,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) { uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; - uint32_t mask=0x000000FF; + uint32_t mask=0x0000FF00; __asm__ __volatile__ ( "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. @@ -55,43 +55,63 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r static const uint32_t kmask2 = 0x0f0f0f0f; static const uint32_t kmask3 = 0x03030303; - uint32_t utmp[4]; - int8_t aux8[QK_K]; - // int16_t aux16[16]; - int16x16_t aux16 __attribute__((aligned(64))); - float32x8_t sums __attribute__((aligned(64))); + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; - /* use a vector operation to clear these floats. */ - GGML_F32x8_VEC_ZERO(&sums); + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); float sumf = 0; - for (int i = 0; i < nb; ++i) { - // quants, 4 low bits. const uint8_t * restrict q4 = x[i].qs; - // quants, 1 high bit. const uint8_t * restrict hm = x[i].qh; const int8_t * restrict q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); int8_t * restrict a = aux8; - for (int l = 0; l < 32; ++l) { - a[l+ 0] = q4[l] & 0xF; - a[l+32] = q4[l] >> 4; + uint8_t m = 1; + for (int j = 0; j < QK_K/64; ++j) { + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 
16 : 0); + a += 32; m <<= 1; + for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); + for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); + a += 32; m <<= 1; + q4 += 32; } - for (int is = 0; is < 8; ++is) { - uint8_t m = 1 << is; - for (int l = 0; l < 8; ++l) a[8*is + l] -= (hm[l] & m ? 0 : 16); - } - - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const int8_t * restrict sc = x[i].scales; - - for (int j = 0; j < QK_K/16; ++j) { - const float dl = d * sc[j]; - for (int l = 0; l < 16; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) ((float *)&sums)[l] += dl * (((int16_t *)&aux16)[l] + ((int16_t *)&aux16)[8+l]); - q8 += 16; a += 16; + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; + + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; } + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; } - for (int l = 0; l < 8; ++l) sumf += ((float *)&sums)[l]; + for (int l = 0; l < 8; ++l) sumf += sums[l]; *s = sumf; } From f985372e3aa13a4fd4f7b0655281cc09a0f3b446 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 19:49:16 +0000 Subject: [PATCH 036/105] add missing variable. --- ggml-phi-knc-dot_q5_K_q8_K.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index cab3b9dc2..668bae93b 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -55,6 +55,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r static const uint32_t kmask2 = 0x0f0f0f0f; static const uint32_t kmask3 = 0x03030303; + uint32_t utmp[4]; + const uint8_t * scales = (const uint8_t*)&utmp[0]; const uint8_t * mins = (const uint8_t*)&utmp[2]; From bd6d7e6238d1c6647682fb3dc1d8c4f6abe59457 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 19:55:12 +0000 Subject: [PATCH 037/105] try to use vectorized zeroing function. 
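Swap the memset() on sums for GGML_F32x8_VEC_ZERO(). For reference while
debugging, the call has to leave the vector in the same state as the scalar
form it replaces (plain C sketch, using the float32x8_t typedef already in
this file):

    float32x8_t sums;
    for (int l = 0; l < 8; ++l)
        ((float *)&sums)[l] = 0.0f;          /* what GGML_F32x8_VEC_ZERO(&sums) must produce */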
--- ggml-phi-knc-dot_q5_K_q8_K.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 668bae93b..68c1aa965 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -30,7 +30,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) { uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; - uint32_t mask=0x0000FF00; + uint32_t mask=0x0000000F; __asm__ __volatile__ ( "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. @@ -62,9 +62,12 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r int8_t aux8[QK_K]; int16_t aux16[8]; - float sums [8]; + float32x8_t sums; int32_t aux32[8]; - memset(sums, 0, 8*sizeof(float)); + + //memset(sums, 0, 8*sizeof(float)); + + GGML_F32x8_VEC_ZERO(&sums); float sumf = 0; for (int i = 0; i < nb; ++i) { @@ -110,10 +113,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r q8 += 8; a += 8; } const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + for (int l = 0; l < 8; ++l) ((float *)&sums)[l] += d * aux32[l]; const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } - for (int l = 0; l < 8; ++l) sumf += sums[l]; + for (int l = 0; l < 8; ++l) sumf += ((float *)&sums)[l]; *s = sumf; } From 9d7ca41703892293073d539bcaa268e888462aca Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 20:48:43 +0000 Subject: [PATCH 038/105] expand mask, and align memory. --- ggml-phi-knc-dot_q5_K_q8_K.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 68c1aa965..a9a9c0ae9 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -30,7 +30,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) { uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; - uint32_t mask=0x0000000F; + uint32_t mask=0x000000FF; __asm__ __volatile__ ( "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. @@ -39,7 +39,7 @@ inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) : [RES] "+m" (*target) : [Z] "m" (zero), [M] "r" (mask) - : "r9", "zmm8", "k1"); + : "zmm8", "k1", memory); } void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { @@ -62,11 +62,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r int8_t aux8[QK_K]; int16_t aux16[8]; - float32x8_t sums; + float32x8_t sums __attribute__((aligned(64))); int32_t aux32[8]; - //memset(sums, 0, 8*sizeof(float)); - GGML_F32x8_VEC_ZERO(&sums); float sumf = 0; From bb5eb95816d38aab2cb70bb11b3026b4f5181d8e Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 20:49:11 +0000 Subject: [PATCH 039/105] use better memory save operator. --- ggml-phi-knc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index 341bbc01b..e767e2306 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -6,9 +6,6 @@ // For memcpy. #include -// No, we have an SIMD unit. -// #define GGML_SIMD - // This SIMD unit can work with 32 float32s at once. 
#define GGML_F32_STEP 32 // We can fit 16 of these float32s in a single vector register. @@ -27,7 +24,7 @@ inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target) __asm__ __volatile__ ( "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. - "vmovaps\t\t%%zmm8,\t%[RES]\n\t" + "vmovnraps\t\t%%zmm8,\t%[RES]\n\t" : [RES] "+m" (*target) : [Z] "m" (zero) : "zmm8"); From f09b3ed79ebd6d9bf767976f57e9b1caac32b273 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 20:53:16 +0000 Subject: [PATCH 040/105] use quotes properly. --- ggml-phi-knc-dot_q5_K_q8_K.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index a9a9c0ae9..b4049e9b5 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -39,7 +39,7 @@ inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) : [RES] "+m" (*target) : [Z] "m" (zero), [M] "r" (mask) - : "zmm8", "k1", memory); + : "zmm8", "k1", "memory"); } void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { From 2fdd11fe3a65f043a54e6950257512c34162f6eb Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 21:00:51 +0000 Subject: [PATCH 041/105] promote aux16 to a vector. --- ggml-phi-knc-dot_q5_K_q8_K.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index b4049e9b5..1443398ff 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -22,7 +22,7 @@ #define GGML_F32_EPR 16 typedef float float32x8_t __attribute__((vector_size (64))); -typedef int16_t int16x16_t __attribute__((vector_size (64))); +typedef int16_t int16x8_t __attribute__((vector_size (32))); /* A forward declaration, to keep GCC happy. 
*/ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); @@ -61,7 +61,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * mins = (const uint8_t*)&utmp[2]; int8_t aux8[QK_K]; - int16_t aux16[8]; + int16x8_t aux16 __attribute__((aligned(64))); float32x8_t sums __attribute__((aligned(64))); int32_t aux32[8]; @@ -97,17 +97,17 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r int is = 0; for (int j = 0; j < QK_K/32; ++j) { int32_t scale = scales[is++]; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + for (int l = 0; l < 8; ++l) ((int16_t *)aux16)[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)aux16)[l]; q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + for (int l = 0; l < 8; ++l) ((int16_t *)aux16)[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)aux16)[l]; q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + for (int l = 0; l < 8; ++l) ((int16_t *)aux16)[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)aux16)[l]; q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + for (int l = 0; l < 8; ++l) ((int16_t *)aux16)[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)aux16)[l]; q8 += 8; a += 8; } const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; From f967690a415ee72efb7d6ea7e7292d084bc5d278 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 21:05:50 +0000 Subject: [PATCH 042/105] add missing address of operators. 
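aux16 is a vector value now, not an array, so it no longer decays to a
pointer; the casts have to go through its address. Minimal illustration
(the variable names are only for the example):

    int16_t    arr[16];
    int16x16_t vec;

    int16_t *p = (int16_t *)arr;     /* fine: an array decays to a pointer            */
    int16_t *q = (int16_t *)&vec;    /* a vector does not, so take its address first  */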
--- ggml-phi-knc-dot_q5_K_q8_K.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 1443398ff..c10852c57 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -97,17 +97,17 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r int is = 0; for (int j = 0; j < QK_K/32; ++j) { int32_t scale = scales[is++]; - for (int l = 0; l < 8; ++l) ((int16_t *)aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)aux16)[l]; + for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)&aux16)[l]; q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) ((int16_t *)aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)aux16)[l]; + for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)&aux16)[l]; q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) ((int16_t *)aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)aux16)[l]; + for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)&aux16)[l]; q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) ((int16_t *)aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)aux16)[l]; + for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)&aux16)[l]; q8 += 8; a += 8; } const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; From ea1edb0600c746b60bce05cccc0567d832c27725 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 21:12:35 +0000 Subject: [PATCH 043/105] promote aux32 to a vector. --- ggml-phi-knc-dot_q5_K_q8_K.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index c10852c57..dac4b3257 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -23,6 +23,7 @@ typedef float float32x8_t __attribute__((vector_size (64))); typedef int16_t int16x8_t __attribute__((vector_size (32))); +typedef int32_t int32x8_t __attribute__((vector_size (64))); /* A forward declaration, to keep GCC happy. 
*/ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); @@ -63,7 +64,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r int8_t aux8[QK_K]; int16x8_t aux16 __attribute__((aligned(64))); float32x8_t sums __attribute__((aligned(64))); - int32_t aux32[8]; + int32x8_t aux32 __attribute__((aligned(64))); GGML_F32x8_VEC_ZERO(&sums); @@ -98,20 +99,20 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int j = 0; j < QK_K/32; ++j) { int32_t scale = scales[is++]; for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)&aux16)[l]; + for (int l = 0; l < 8; ++l) ((int32_t *)&aux32)[l] += scale * ((int16_t *)&aux16)[l]; q8 += 8; a += 8; for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)&aux16)[l]; + for (int l = 0; l < 8; ++l) ((int32_t *)&aux32)[l] += scale * ((int16_t *)&aux16)[l]; q8 += 8; a += 8; for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)&aux16)[l]; + for (int l = 0; l < 8; ++l) ((int32_t *)&aux32)[l] += scale * ((int16_t *)&aux16)[l]; q8 += 8; a += 8; for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * ((int16_t *)&aux16)[l]; + for (int l = 0; l < 8; ++l) ((int32_t *)&aux32)[l] += scale * ((int16_t *)&aux16)[l]; q8 += 8; a += 8; } const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - for (int l = 0; l < 8; ++l) ((float *)&sums)[l] += d * aux32[l]; + for (int l = 0; l < 8; ++l) ((float *)&sums)[l] += d * ((int32_t *)&aux32)[l]; const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } From 4477b8e123c960936e9ec31f20d1f1644ca8b176 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 21:16:23 +0000 Subject: [PATCH 044/105] add I32 vector memory clearing. --- ggml-phi-knc-dot_q5_K_q8_K.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index dac4b3257..b2a7f3106 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -43,6 +43,21 @@ inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) : "zmm8", "k1", "memory"); } +inline static void GGML_I32x8_VEC_ZERO(int32x8_t *target) +{ + uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; + uint32_t mask=0x000000FF; + + __asm__ __volatile__ ( + "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. + "kmov\t%[M],\t%%k1\n\t" + "vmovaps\t\t%%zmm8,\t%[RES]%{%%k1%}\n\t" + : [RES] "+m" (*target) + : [Z] "m" (zero), + [M] "r" (mask) + : "zmm8", "k1", "memory"); +} + void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { /* interpret X and Y as vectors. 
*/ @@ -73,7 +88,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * restrict q4 = x[i].qs; const uint8_t * restrict hm = x[i].qh; const int8_t * restrict q8 = y[i].qs; - memset(aux32, 0, 8*sizeof(int32_t)); + GGML_I32x8_VEC_ZERO(&aux32); + int8_t * restrict a = aux8; uint8_t m = 1; for (int j = 0; j < QK_K/64; ++j) { From a5132a15071280529271f12c9f51aab3d69ae650 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 22:16:57 +0000 Subject: [PATCH 045/105] attempt our first FMA. --- ggml-phi-knc-dot_q5_K_q8_K.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index b2a7f3106..adbb55b4b 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -58,6 +58,31 @@ inline static void GGML_I32x8_VEC_ZERO(int32x8_t *target) : "zmm8", "k1", "memory"); } +// perform an eight wide Fused Multiply Add of an I16x8 times scalar S into I32x8. +inline static void GGML_I16x8_S_FMA_I32x8 (int16x8_t *src, int32_t scale, int32x8_t *dest) +{ + uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; + uint32_t mask=0x000000FF; + int32_t scaleVec[4] = {scale, scale, scale, scale}; + + __asm__ __volatile__ ( + "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm0\n\t" // use an upscaling operator to clear our value. + "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm1\n\t" // use an upscaling operator to clear our value. + "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm2\n\t" // use an upscaling operator to clear our value. + "kmov\t%[M],\t%%k1\n\t" // we will only be working with 8 values at a time. le sigh. + "vmovaps\t\t%[SRC]%{int16%},\t%%zmm0%{%%k1%}\n\t" // load the item we will be summing from. upscale it from int16. + "vbroadcastI32x4\t%[SCALE],\t%%zmm1\n\t" // load the item we will be multiplying by. + "vmovaps\t\t%[RES],\t%%zmm2%{%%k1%}\n\t" // load the item we will be summing onto. + "vpmadd231d\t%%zmm0,\t%%zmm1,\t%%zmm2%{%%k1%}\n\t" // perform our multiply-add. + "vmovaps\t\t%%zmm2,\t%[RES]%{%%k1}\n\t" // save the result. + : [RES] "+m" (*target) + : [Z] "m" (zero), + [M] "r" (mask), + [SRC] "m" (src), + [SCALE] "m" (scaleVec) + : "zmm0", "zmm1", "zmm2", "k1", "memory"); +} + void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { /* interpret X and Y as vectors. */ @@ -124,7 +149,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int l = 0; l < 8; ++l) ((int32_t *)&aux32)[l] += scale * ((int16_t *)&aux16)[l]; q8 += 8; a += 8; for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) ((int32_t *)&aux32)[l] += scale * ((int16_t *)&aux16)[l]; + GGML_I16x8_S_FMA_I32x8 (aux16, scale, aux32); + // for (int l = 0; l < 8; ++l) ((int32_t *)&aux32)[l] += scale * ((int16_t *)&aux16)[l]; q8 += 8; a += 8; } const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; From 5935bb34f49ec2e91f55c4f6cd037b4b70a34a49 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 23 Mar 2024 23:46:36 +0000 Subject: [PATCH 046/105] use proper mov operator, and pass addresses. 
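Three fixes in one: integer data wants the integer move (vmovdqa32 with a
{sint16} up-conversion) rather than vmovaps, the output operand has to name
*dest (the parameter that actually exists), and since the helpers take
pointers the call site now passes &aux16 and &aux32. The shape being aimed
for, as a stripped-down helper (the name is made up for the illustration; it
is not the real routine):

    inline static void I16x16_LOAD_STORE_I32x16 (int16x16_t *src, int32x16_t *dest)
    {
      __asm__ __volatile__ (
        "vmovdqa32\t\t%[SRC]%{sint16%},\t%%zmm0\n\t"   // up-converting integer load
        "vmovdqa32\t\t%%zmm0,\t%[RES]\n\t"             // plain integer store
        : [RES] "=m" (*dest)                           // the parameters are dereferenced here,
        : [SRC] "m" (*src)                             // so callers pass &aux16 / &aux32.
        : "zmm0", "memory");
    }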
--- ggml-phi-knc-dot_q5_K_q8_K.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index adbb55b4b..ab64198c0 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -70,12 +70,12 @@ inline static void GGML_I16x8_S_FMA_I32x8 (int16x8_t *src, int32_t scale, int32x "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm1\n\t" // use an upscaling operator to clear our value. "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm2\n\t" // use an upscaling operator to clear our value. "kmov\t%[M],\t%%k1\n\t" // we will only be working with 8 values at a time. le sigh. - "vmovaps\t\t%[SRC]%{int16%},\t%%zmm0%{%%k1%}\n\t" // load the item we will be summing from. upscale it from int16. + "vmovdqa32\t\t%[SRC]%{sint16%},\t%%zmm0%{%%k1%}\n\t" // load the item we will be summing from. upscale it from int16. "vbroadcastI32x4\t%[SCALE],\t%%zmm1\n\t" // load the item we will be multiplying by. - "vmovaps\t\t%[RES],\t%%zmm2%{%%k1%}\n\t" // load the item we will be summing onto. + "vmovdqa32\t\t%[RES],\t%%zmm2%{%%k1%}\n\t" // load the item we will be summing onto. "vpmadd231d\t%%zmm0,\t%%zmm1,\t%%zmm2%{%%k1%}\n\t" // perform our multiply-add. - "vmovaps\t\t%%zmm2,\t%[RES]%{%%k1}\n\t" // save the result. - : [RES] "+m" (*target) + "vmovdqa32\t\t%%zmm2,\t%[RES]%{%%k1}\n\t" // save the result. + : [RES] "+m" (*dest) : [Z] "m" (zero), [M] "r" (mask), [SRC] "m" (src), @@ -149,8 +149,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int l = 0; l < 8; ++l) ((int32_t *)&aux32)[l] += scale * ((int16_t *)&aux16)[l]; q8 += 8; a += 8; for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - GGML_I16x8_S_FMA_I32x8 (aux16, scale, aux32); - // for (int l = 0; l < 8; ++l) ((int32_t *)&aux32)[l] += scale * ((int16_t *)&aux16)[l]; + GGML_I16x8_S_FMA_I32x8 (&aux16, scale, &aux32); q8 += 8; a += 8; } const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; From 03a3e0eb7aab095d02bde39af7ed9217d318ce9d Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sun, 24 Mar 2024 12:04:44 +0000 Subject: [PATCH 047/105] perform 16 operations at a time. --- ggml-phi-knc-dot_q5_K_q8_K.c | 77 ++++++++++++++++++++++++------------ 1 file changed, 51 insertions(+), 26 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index ab64198c0..37f7cb8fa 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -24,6 +24,8 @@ typedef float float32x8_t __attribute__((vector_size (64))); typedef int16_t int16x8_t __attribute__((vector_size (32))); typedef int32_t int32x8_t __attribute__((vector_size (64))); +typedef int16_t int16x16_t __attribute__((vector_size (64))); +typedef int32_t int32x16_t __attribute__((vector_size (128))); /* A forward declaration, to keep GCC happy. */ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); @@ -58,6 +60,19 @@ inline static void GGML_I32x8_VEC_ZERO(int32x8_t *target) : "zmm8", "k1", "memory"); } +inline static void GGML_I32x16_VEC_ZERO(int32x8_t *target) +{ + uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; + + __asm__ __volatile__ ( + "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. 
+ "kmov\t%[M],\t%%k1\n\t" + "vmovaps\t\t%%zmm8,\t%[RES]%{%%k1%}\n\t" + : [RES] "+m" (*target) + : [Z] "m" (zero) + : "zmm8", "k1", "memory"); +} + // perform an eight wide Fused Multiply Add of an I16x8 times scalar S into I32x8. inline static void GGML_I16x8_S_FMA_I32x8 (int16x8_t *src, int32_t scale, int32x8_t *dest) { @@ -66,15 +81,12 @@ inline static void GGML_I16x8_S_FMA_I32x8 (int16x8_t *src, int32_t scale, int32x int32_t scaleVec[4] = {scale, scale, scale, scale}; __asm__ __volatile__ ( - "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm0\n\t" // use an upscaling operator to clear our value. - "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm1\n\t" // use an upscaling operator to clear our value. - "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm2\n\t" // use an upscaling operator to clear our value. "kmov\t%[M],\t%%k1\n\t" // we will only be working with 8 values at a time. le sigh. - "vmovdqa32\t\t%[SRC]%{sint16%},\t%%zmm0%{%%k1%}\n\t" // load the item we will be summing from. upscale it from int16. + "vmovdqa32\t\t%[SRC]%{sint16%},\t%%zmm0%{%%k1%}\n\t" // load the item we will be summing from. upscale it from int16. "vbroadcastI32x4\t%[SCALE],\t%%zmm1\n\t" // load the item we will be multiplying by. - "vmovdqa32\t\t%[RES],\t%%zmm2%{%%k1%}\n\t" // load the item we will be summing onto. + "vmovdqa32\t\t%[RES],\t%%zmm2%{%%k1%}\n\t" // load the item we will be summing onto. "vpmadd231d\t%%zmm0,\t%%zmm1,\t%%zmm2%{%%k1%}\n\t" // perform our multiply-add. - "vmovdqa32\t\t%%zmm2,\t%[RES]%{%%k1}\n\t" // save the result. + "vmovdqa32\t\t%%zmm2,\t%[RES]%{%%k1}\n\t" // save the result. : [RES] "+m" (*dest) : [Z] "m" (zero), [M] "r" (mask), @@ -83,6 +95,23 @@ inline static void GGML_I16x8_S_FMA_I32x8 (int16x8_t *src, int32_t scale, int32x : "zmm0", "zmm1", "zmm2", "k1", "memory"); } +// perform an eight wide Fused Multiply Add of an I16x16 times scalar S into I32x16. +inline static void GGML_I16x16_S_FMA_I32x16 (int16x8_t *src, int32_t scale, int32x8_t *dest) +{ + int32_t scaleVec[4] = {scale, scale, scale, scale}; + + __asm__ __volatile__ ( + "vmovdqa32\t\t%[SRC]%{sint16%},\t%%zmm0\n\t" // load the item we will be summing from. upscale it from int16. + "vbroadcastI32x4\t%[SCALE],\t%%zmm1\n\t" // load the item we will be multiplying by. + "vmovdqa32\t\t%[RES],\t%%zmm2\n\t" // load the item we will be summing onto. + "vpmadd231d\t%%zmm0,\t%%zmm1,\t%%zmm2\n\t" // perform our multiply-add. + "vmovdqa32\t\t%%zmm2,\t%[RES]\n\t" // save the result. + : [RES] "+m" (*dest) + : [SRC] "m" (src), + [SCALE] "m" (scaleVec) + : "zmm0", "zmm1", "zmm2", "k1", "memory"); +} + void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { /* interpret X and Y as vectors. 
*/ @@ -101,19 +130,20 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * scales = (const uint8_t*)&utmp[0]; const uint8_t * mins = (const uint8_t*)&utmp[2]; - int8_t aux8[QK_K]; - int16x8_t aux16 __attribute__((aligned(64))); - float32x8_t sums __attribute__((aligned(64))); - int32x8_t aux32 __attribute__((aligned(64))); + int8_t aux8[QK_K]; + int16x16_t aux16 __attribute__((aligned(128))); + float32x16_t sums __attribute__((aligned(64))); + int32x16_t aux32 __attribute__((aligned(128))); - GGML_F32x8_VEC_ZERO(&sums); + GGML_F32x16_VEC_ZERO(&sums); float sumf = 0; for (int i = 0; i < nb; ++i) { const uint8_t * restrict q4 = x[i].qs; const uint8_t * restrict hm = x[i].qh; const int8_t * restrict q8 = y[i].qs; - GGML_I32x8_VEC_ZERO(&aux32); + + GGML_I32x16_VEC_ZERO(&aux32); int8_t * restrict a = aux8; uint8_t m = 1; @@ -139,24 +169,19 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r int is = 0; for (int j = 0; j < QK_K/32; ++j) { int32_t scale = scales[is++]; - for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) ((int32_t *)&aux32)[l] += scale * ((int16_t *)&aux16)[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) ((int32_t *)&aux32)[l] += scale * ((int16_t *)&aux16)[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) ((int32_t *)&aux32)[l] += scale * ((int16_t *)&aux16)[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - GGML_I16x8_S_FMA_I32x8 (&aux16, scale, &aux32); - q8 += 8; a += 8; + for (int l = 0; l < 16; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; + GGML_I16x8_S_FMA_I32x16 (&aux16, scale, &aux32); + q8 += 16; a += 16; + /* FIXME: while comparing FMA output to normal output, the original had an error. hunt it down. */ + for (int l = 0; l < 16; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; + GGML_I16x8_S_FMA_I32x16 (&aux16, scale, &aux32); + q8 += 16; a += 16; } const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - for (int l = 0; l < 8; ++l) ((float *)&sums)[l] += d * ((int32_t *)&aux32)[l]; + for (int l = 0; l < 16; ++l) ((float *)&sums)[l] += d * ((int32_t *)&aux32)[l]; const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } - for (int l = 0; l < 8; ++l) sumf += ((float *)&sums)[l]; + for (int l = 0; l < 16; ++l) sumf += ((float *)&sums)[l]; *s = sumf; } From ba4f4129b362fd16c336ca74d4a3ef4aaffe27a9 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sun, 24 Mar 2024 12:17:06 +0000 Subject: [PATCH 048/105] better comments, and fix some small errors. --- ggml-phi-knc-dot_q5_K_q8_K.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 37f7cb8fa..66f1f1622 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -30,6 +30,7 @@ typedef int32_t int32x16_t __attribute__((vector_size (128))); /* A forward declaration, to keep GCC happy. */ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); +/* clear a vector of 8 floats. 
*/ inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) { uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; @@ -45,6 +46,7 @@ inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) : "zmm8", "k1", "memory"); } +/* clear a vector of 8 int32_ts. */ inline static void GGML_I32x8_VEC_ZERO(int32x8_t *target) { uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; @@ -60,7 +62,8 @@ inline static void GGML_I32x8_VEC_ZERO(int32x8_t *target) : "zmm8", "k1", "memory"); } -inline static void GGML_I32x16_VEC_ZERO(int32x8_t *target) +/* clear a vector of 16 int32_ts. */ +inline static void GGML_I32x16_VEC_ZERO(int32x16_t *target) { uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; @@ -73,7 +76,7 @@ inline static void GGML_I32x16_VEC_ZERO(int32x8_t *target) : "zmm8", "k1", "memory"); } -// perform an eight wide Fused Multiply Add of an I16x8 times scalar S into I32x8. +// perform a Fused Multiply Add of an I16x8 times scalar S into I32x8. inline static void GGML_I16x8_S_FMA_I32x8 (int16x8_t *src, int32_t scale, int32x8_t *dest) { uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; @@ -95,8 +98,8 @@ inline static void GGML_I16x8_S_FMA_I32x8 (int16x8_t *src, int32_t scale, int32x : "zmm0", "zmm1", "zmm2", "k1", "memory"); } -// perform an eight wide Fused Multiply Add of an I16x16 times scalar S into I32x16. -inline static void GGML_I16x16_S_FMA_I32x16 (int16x8_t *src, int32_t scale, int32x8_t *dest) +// perform a Fused Multiply Add of an I16x16 times scalar S into I32x16. +inline static void GGML_I16x16_S_FMA_I32x16 (int16x16_t *src, int32_t scale, int32x16_t *dest) { int32_t scaleVec[4] = {scale, scale, scale, scale}; @@ -131,8 +134,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * mins = (const uint8_t*)&utmp[2]; int8_t aux8[QK_K]; - int16x16_t aux16 __attribute__((aligned(128))); - float32x16_t sums __attribute__((aligned(64))); + float32x16_t sums __attribute__((aligned(128))); + int16x16_t aux16 __attribute__((aligned(64))); int32x16_t aux32 __attribute__((aligned(128))); GGML_F32x16_VEC_ZERO(&sums); @@ -143,8 +146,6 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * restrict hm = x[i].qh; const int8_t * restrict q8 = y[i].qs; - GGML_I32x16_VEC_ZERO(&aux32); - int8_t * restrict a = aux8; uint8_t m = 1; for (int j = 0; j < QK_K/64; ++j) { @@ -164,17 +165,20 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r utmp[0] &= kmask1; int sumi = 0; + + GGML_I32x16_VEC_ZERO(&aux32); + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; a = aux8; int is = 0; for (int j = 0; j < QK_K/32; ++j) { int32_t scale = scales[is++]; for (int l = 0; l < 16; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - GGML_I16x8_S_FMA_I32x16 (&aux16, scale, &aux32); + GGML_I16x16_S_FMA_I32x16 (&aux16, scale, &aux32); q8 += 16; a += 16; /* FIXME: while comparing FMA output to normal output, the original had an error. hunt it down. */ for (int l = 0; l < 16; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - GGML_I16x8_S_FMA_I32x16 (&aux16, scale, &aux32); + GGML_I16x16_S_FMA_I32x16 (&aux16, scale, &aux32); q8 += 16; a += 16; } const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; From c28bfe4552de457578e6f83d0111c44a42079230 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sun, 24 Mar 2024 12:37:47 +0000 Subject: [PATCH 049/105] spacing changes, eliminate dead references to k1 or zero, and use the right type when referring to src. 
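The other small fix here: an "m" constraint describes the object handed to it,
so "m" (src) makes the pointer variable itself the memory operand, while the
instruction actually reads the vector it points at, hence "m" (*src).
Pared-down example (the function name is only for illustration):

    static void load_example (int32x16_t *src)
    {
      __asm__ __volatile__ (
        "vmovdqa32\t\t%[SRC],\t%%zmm0\n\t"   // reads the 64-byte vector, not the pointer
        : /* no outputs */
        : [SRC] "m" (*src)
        : "zmm0");
    }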
--- ggml-phi-knc-dot_q5_K_q8_K.c | 78 ++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 66f1f1622..a067a8724 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -22,9 +22,10 @@ #define GGML_F32_EPR 16 typedef float float32x8_t __attribute__((vector_size (64))); +typedef float float32x16_t __attribute__((vector_size (128))); typedef int16_t int16x8_t __attribute__((vector_size (32))); -typedef int32_t int32x8_t __attribute__((vector_size (64))); typedef int16_t int16x16_t __attribute__((vector_size (64))); +typedef int32_t int32x8_t __attribute__((vector_size (64))); typedef int32_t int32x16_t __attribute__((vector_size (128))); /* A forward declaration, to keep GCC happy. */ @@ -37,13 +38,13 @@ inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) uint32_t mask=0x000000FF; __asm__ __volatile__ ( - "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. - "kmov\t%[M],\t%%k1\n\t" + "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. + "kmov\t%[M],\t%%k1\n\t" "vmovaps\t\t%%zmm8,\t%[RES]%{%%k1%}\n\t" - : [RES] "+m" (*target) - : [Z] "m" (zero), - [M] "r" (mask) - : "zmm8", "k1", "memory"); + : [RES] "+m" (*target) + : [Z] "m" (zero), + [M] "r" (mask) + : "zmm8", "k1", "memory"); } /* clear a vector of 8 int32_ts. */ @@ -53,13 +54,13 @@ inline static void GGML_I32x8_VEC_ZERO(int32x8_t *target) uint32_t mask=0x000000FF; __asm__ __volatile__ ( - "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. - "kmov\t%[M],\t%%k1\n\t" + "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. + "kmov\t%[M],\t%%k1\n\t" "vmovaps\t\t%%zmm8,\t%[RES]%{%%k1%}\n\t" - : [RES] "+m" (*target) - : [Z] "m" (zero), - [M] "r" (mask) - : "zmm8", "k1", "memory"); + : [RES] "+m" (*target) + : [Z] "m" (zero), + [M] "r" (mask) + : "zmm8", "k1", "memory"); } /* clear a vector of 16 int32_ts. */ @@ -68,12 +69,11 @@ inline static void GGML_I32x16_VEC_ZERO(int32x16_t *target) uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; __asm__ __volatile__ ( - "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. - "kmov\t%[M],\t%%k1\n\t" - "vmovaps\t\t%%zmm8,\t%[RES]%{%%k1%}\n\t" - : [RES] "+m" (*target) - : [Z] "m" (zero) - : "zmm8", "k1", "memory"); + "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. + "vmovaps\t\t%%zmm8,\t%[RES]\n\t" + : [RES] "+m" (*target) + : [Z] "m" (zero) + : "zmm8", "memory"); } // perform a Fused Multiply Add of an I16x8 times scalar S into I32x8. @@ -84,18 +84,18 @@ inline static void GGML_I16x8_S_FMA_I32x8 (int16x8_t *src, int32_t scale, int32x int32_t scaleVec[4] = {scale, scale, scale, scale}; __asm__ __volatile__ ( - "kmov\t%[M],\t%%k1\n\t" // we will only be working with 8 values at a time. le sigh. - "vmovdqa32\t\t%[SRC]%{sint16%},\t%%zmm0%{%%k1%}\n\t" // load the item we will be summing from. upscale it from int16. - "vbroadcastI32x4\t%[SCALE],\t%%zmm1\n\t" // load the item we will be multiplying by. + "kmov\t%[M],\t%%k1\n\t" // we will only be working with 8 values at a time. le sigh. + "vmovdqa32\t\t%[SRC]%{sint16%},\t%%zmm0%{%%k1%}\n\t" // load the item we will be summing from. upscale it from int16. + "vbroadcastI32x4\t%[SCALE],\t%%zmm1\n\t" // load the item we will be multiplying by. 
"vmovdqa32\t\t%[RES],\t%%zmm2%{%%k1%}\n\t" // load the item we will be summing onto. - "vpmadd231d\t%%zmm0,\t%%zmm1,\t%%zmm2%{%%k1%}\n\t" // perform our multiply-add. - "vmovdqa32\t\t%%zmm2,\t%[RES]%{%%k1}\n\t" // save the result. - : [RES] "+m" (*dest) - : [Z] "m" (zero), - [M] "r" (mask), - [SRC] "m" (src), - [SCALE] "m" (scaleVec) - : "zmm0", "zmm1", "zmm2", "k1", "memory"); + "vpmadd231d\t%%zmm0,\t%%zmm1,\t%%zmm2%{%%k1%}\n\t" // perform our multiply-add. + "vmovdqa32\t\t%%zmm2,\t%[RES]%{%%k1}\n\t" // save the result. + : [RES] "+m" (*dest) + : [Z] "m" (zero), + [M] "r" (mask), + [SRC] "m" (src), + [SCALE] "m" (scaleVec) + : "zmm0", "zmm1", "zmm2", "k1", "memory"); } // perform a Fused Multiply Add of an I16x16 times scalar S into I32x16. @@ -104,15 +104,15 @@ inline static void GGML_I16x16_S_FMA_I32x16 (int16x16_t *src, int32_t scale, int int32_t scaleVec[4] = {scale, scale, scale, scale}; __asm__ __volatile__ ( - "vmovdqa32\t\t%[SRC]%{sint16%},\t%%zmm0\n\t" // load the item we will be summing from. upscale it from int16. - "vbroadcastI32x4\t%[SCALE],\t%%zmm1\n\t" // load the item we will be multiplying by. + "vmovdqa32\t\t%[SRC]%{sint16%},\t%%zmm0\n\t" // load the item we will be summing from. upscale it from int16. + "vbroadcastI32x4\t%[SCALE],\t%%zmm1\n\t" // load the item we will be multiplying by. "vmovdqa32\t\t%[RES],\t%%zmm2\n\t" // load the item we will be summing onto. - "vpmadd231d\t%%zmm0,\t%%zmm1,\t%%zmm2\n\t" // perform our multiply-add. - "vmovdqa32\t\t%%zmm2,\t%[RES]\n\t" // save the result. - : [RES] "+m" (*dest) - : [SRC] "m" (src), - [SCALE] "m" (scaleVec) - : "zmm0", "zmm1", "zmm2", "k1", "memory"); + "vpmadd231d\t%%zmm0,\t%%zmm1,\t%%zmm2\n\t" // perform our multiply-add. + "vmovdqa32\t\t%%zmm2,\t%[RES]\n\t" // save the result. + : [RES] "+m" (*dest) + : [SRC] "m" (*src), + [SCALE] "m" (scaleVec) + : "zmm0", "zmm1", "zmm2", "memory"); } void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { @@ -176,8 +176,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int l = 0; l < 16; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; GGML_I16x16_S_FMA_I32x16 (&aux16, scale, &aux32); q8 += 16; a += 16; - /* FIXME: while comparing FMA output to normal output, the original had an error. hunt it down. */ for (int l = 0; l < 16; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; + // FIXME: while comparing FMA output to the original output, the original had an error. hunt it down. GGML_I16x16_S_FMA_I32x16 (&aux16, scale, &aux32); q8 += 16; a += 16; } From 169a1454092ee868d51985d788e6f6f14b8273f1 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sun, 24 Mar 2024 12:41:21 +0000 Subject: [PATCH 050/105] fix our reference to src in the second place, and use a more accurate comment. --- ggml-phi-knc-dot_q5_K_q8_K.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index a067a8724..8688836b9 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -38,7 +38,7 @@ inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) uint32_t mask=0x000000FF; __asm__ __volatile__ ( - "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. + "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our register. 
"kmov\t%[M],\t%%k1\n\t" "vmovaps\t\t%%zmm8,\t%[RES]%{%%k1%}\n\t" : [RES] "+m" (*target) @@ -54,7 +54,7 @@ inline static void GGML_I32x8_VEC_ZERO(int32x8_t *target) uint32_t mask=0x000000FF; __asm__ __volatile__ ( - "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. + "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our register. "kmov\t%[M],\t%%k1\n\t" "vmovaps\t\t%%zmm8,\t%[RES]%{%%k1%}\n\t" : [RES] "+m" (*target) @@ -69,7 +69,7 @@ inline static void GGML_I32x16_VEC_ZERO(int32x16_t *target) uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; __asm__ __volatile__ ( - "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. + "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our register. "vmovaps\t\t%%zmm8,\t%[RES]\n\t" : [RES] "+m" (*target) : [Z] "m" (zero) @@ -93,7 +93,7 @@ inline static void GGML_I16x8_S_FMA_I32x8 (int16x8_t *src, int32_t scale, int32x : [RES] "+m" (*dest) : [Z] "m" (zero), [M] "r" (mask), - [SRC] "m" (src), + [SRC] "m" (*src), [SCALE] "m" (scaleVec) : "zmm0", "zmm1", "zmm2", "k1", "memory"); } From cf481cf9017e32508f901c36af761b313cd70938 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sun, 24 Mar 2024 12:50:01 +0000 Subject: [PATCH 051/105] promote aux8 into a vector. --- ggml-phi-knc-dot_q5_K_q8_K.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 8688836b9..66c0f3b58 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -23,6 +23,7 @@ typedef float float32x8_t __attribute__((vector_size (64))); typedef float float32x16_t __attribute__((vector_size (128))); +typedef int8_t int8x16_t __attribute__((vector_size (32))); typedef int16_t int16x8_t __attribute__((vector_size (32))); typedef int16_t int16x16_t __attribute__((vector_size (64))); typedef int32_t int32x8_t __attribute__((vector_size (64))); @@ -79,7 +80,6 @@ inline static void GGML_I32x16_VEC_ZERO(int32x16_t *target) // perform a Fused Multiply Add of an I16x8 times scalar S into I32x8. inline static void GGML_I16x8_S_FMA_I32x8 (int16x8_t *src, int32_t scale, int32x8_t *dest) { - uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; uint32_t mask=0x000000FF; int32_t scaleVec[4] = {scale, scale, scale, scale}; @@ -91,8 +91,7 @@ inline static void GGML_I16x8_S_FMA_I32x8 (int16x8_t *src, int32_t scale, int32x "vpmadd231d\t%%zmm0,\t%%zmm1,\t%%zmm2%{%%k1%}\n\t" // perform our multiply-add. "vmovdqa32\t\t%%zmm2,\t%[RES]%{%%k1}\n\t" // save the result. 
: [RES] "+m" (*dest) - : [Z] "m" (zero), - [M] "r" (mask), + : [M] "r" (mask), [SRC] "m" (*src), [SCALE] "m" (scaleVec) : "zmm0", "zmm1", "zmm2", "k1", "memory"); @@ -134,6 +133,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * mins = (const uint8_t*)&utmp[2]; int8_t aux8[QK_K]; + int8x16_t aux8x16[QK_K/16] __attribute__((aligned(32))); float32x16_t sums __attribute__((aligned(128))); int16x16_t aux16 __attribute__((aligned(64))); int32x16_t aux32 __attribute__((aligned(128))); @@ -146,7 +146,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * restrict hm = x[i].qh; const int8_t * restrict q8 = y[i].qs; - int8_t * restrict a = aux8; + int8_t * restrict a = aux8_16; uint8_t m = 1; for (int j = 0; j < QK_K/64; ++j) { for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); @@ -169,7 +169,6 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r GGML_I32x16_VEC_ZERO(&aux32); for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; - a = aux8; int is = 0; for (int j = 0; j < QK_K/32; ++j) { int32_t scale = scales[is++]; From ca0dc26704cf9c7d40e6de691ff462efaeecebca Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sun, 24 Mar 2024 13:35:05 +0000 Subject: [PATCH 052/105] loosen alignment requirements for zeros, add missing function, and promote aux8 to an array of vectors. --- ggml-phi-knc-dot_q5_K_q8_K.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 66c0f3b58..26e03d241 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -35,7 +35,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r /* clear a vector of 8 floats. */ inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) { - uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; + uint8_t zero[4] __attribute__((aligned(32))) = {0,0,0,0}; uint32_t mask=0x000000FF; __asm__ __volatile__ ( @@ -48,10 +48,23 @@ inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) : "zmm8", "k1", "memory"); } +/* clear a vector of 16 floats. */ +inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target) +{ + uint8_t zero[4] __attribute__((aligned(32))) = {0,0,0,0}; + + __asm__ __volatile__ ( + "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our register. + "vmovaps\t\t%%zmm8,\t%[RES]\n\t" + : [RES] "+m" (*target) + : [Z] "m" (zero) + : "zmm8", "memory"); +} + /* clear a vector of 8 int32_ts. */ inline static void GGML_I32x8_VEC_ZERO(int32x8_t *target) { - uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; + uint8_t zero[4] __attribute__((aligned(32))) = {0,0,0,0}; uint32_t mask=0x000000FF; __asm__ __volatile__ ( @@ -67,7 +80,7 @@ inline static void GGML_I32x8_VEC_ZERO(int32x8_t *target) /* clear a vector of 16 int32_ts. */ inline static void GGML_I32x16_VEC_ZERO(int32x16_t *target) { - uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; + uint8_t zero[4] __attribute__((aligned(32))) = {0,0,0,0}; __asm__ __volatile__ ( "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our register. 
@@ -132,9 +145,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * scales = (const uint8_t*)&utmp[0]; const uint8_t * mins = (const uint8_t*)&utmp[2]; - int8_t aux8[QK_K]; - int8x16_t aux8x16[QK_K/16] __attribute__((aligned(32))); float32x16_t sums __attribute__((aligned(128))); + int8x16_t aux8[QK_K/16] __attribute__((aligned(32))); int16x16_t aux16 __attribute__((aligned(64))); int32x16_t aux32 __attribute__((aligned(128))); @@ -146,8 +158,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * restrict hm = x[i].qh; const int8_t * restrict q8 = y[i].qs; - int8_t * restrict a = aux8_16; + int8_t * restrict a = (int8_t * restrict)aux8; uint8_t m = 1; + + // Fill the 8 bit vector a with our 5 bit quantization data, 64 blocks at a time. for (int j = 0; j < QK_K/64; ++j) { for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); @@ -157,12 +171,15 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r a += 32; m <<= 1; q4 += 32; } + memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); const uint32_t uaux = utmp[1] & kmask1; utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); utmp[2] = uaux; utmp[0] &= kmask1; + + a = (int8_t * restrict)aux8; int sumi = 0; From bc3d6db8624170bc60efab5dca66f16fec06b9d9 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sun, 24 Mar 2024 14:18:08 +0000 Subject: [PATCH 053/105] separate filling aux16 from consuming aux16 by making it an array of vectors. --- ggml-phi-knc-dot_q5_K_q8_K.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 26e03d241..eebd12d89 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -147,7 +147,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r float32x16_t sums __attribute__((aligned(128))); int8x16_t aux8[QK_K/16] __attribute__((aligned(32))); - int16x16_t aux16 __attribute__((aligned(64))); + int16x16_t aux16[QK_K/16] __attribute__((aligned(64))); int32x16_t aux32 __attribute__((aligned(128))); GGML_F32x16_VEC_ZERO(&sums); @@ -188,15 +188,19 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; int is = 0; for (int j = 0; j < QK_K/32; ++j) { - int32_t scale = scales[is++]; - for (int l = 0; l < 16; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - GGML_I16x16_S_FMA_I32x16 (&aux16, scale, &aux32); + for (int l = 0; l < 16; ++l) ((int16_t *)&aux16[j*2])[l] = q8[l] * a[l]; q8 += 16; a += 16; - for (int l = 0; l < 16; ++l) ((int16_t *)&aux16)[l] = q8[l] * a[l]; - // FIXME: while comparing FMA output to the original output, the original had an error. hunt it down. - GGML_I16x16_S_FMA_I32x16 (&aux16, scale, &aux32); + for (int l = 0; l < 16; ++l) ((int16_t *)&aux16[(j*2)+1])[l] = q8[l] * a[l]; q8 += 16; a += 16; } + + // FIXME: while comparing FMA output to the original output, the original had an error. hunt it down. 
+ for (int j = 0; j < QK_K/32; ++j) { + int32_t scale = scales[is++]; + GGML_I16x16_S_FMA_I32x16 (&aux16[j*2], scale, &aux32); + GGML_I16x16_S_FMA_I32x16 (&aux16[(j*2)+1], scale, &aux32); + } + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 16; ++l) ((float *)&sums)[l] += d * ((int32_t *)&aux32)[l]; const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; From 12c9576aeca0a11109f5349baf7bdba377ec4353 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Mon, 25 Mar 2024 19:43:37 +0000 Subject: [PATCH 054/105] fix vector sizes. --- ggml-phi-knc-dot_q5_K_q8_K.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index eebd12d89..418fa772d 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -15,19 +15,18 @@ // For block_q5_K and block_q8_K. only given the second time. #include "ggml-common.h" - // This SIMD unit can work with 32 float32s at once. #define GGML_F32_STEP 32 // We can fit 16 of these float32s in a single vector register. #define GGML_F32_EPR 16 -typedef float float32x8_t __attribute__((vector_size (64))); -typedef float float32x16_t __attribute__((vector_size (128))); -typedef int8_t int8x16_t __attribute__((vector_size (32))); -typedef int16_t int16x8_t __attribute__((vector_size (32))); -typedef int16_t int16x16_t __attribute__((vector_size (64))); -typedef int32_t int32x8_t __attribute__((vector_size (64))); -typedef int32_t int32x16_t __attribute__((vector_size (128))); +typedef float float32x8_t __attribute__((vector_size (32))); +typedef float float32x16_t __attribute__((vector_size (64))); +typedef int8_t int8x16_t __attribute__((vector_size (16))); +typedef int16_t int16x8_t __attribute__((vector_size (16))); +typedef int16_t int16x16_t __attribute__((vector_size (32))); +typedef int32_t int32x8_t __attribute__((vector_size (32))); +typedef int32_t int32x16_t __attribute__((vector_size (64))); /* A forward declaration, to keep GCC happy. */ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); @@ -145,10 +144,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * scales = (const uint8_t*)&utmp[0]; const uint8_t * mins = (const uint8_t*)&utmp[2]; - float32x16_t sums __attribute__((aligned(128))); - int8x16_t aux8[QK_K/16] __attribute__((aligned(32))); - int16x16_t aux16[QK_K/16] __attribute__((aligned(64))); - int32x16_t aux32 __attribute__((aligned(128))); + float32x16_t sums __attribute__((aligned(64))); + int8x16_t aux8[QK_K/16] __attribute__((aligned(16))); + int16x16_t aux16[QK_K/16] __attribute__((aligned(32))); + int32x16_t aux32 __attribute__((aligned(64))); GGML_F32x16_VEC_ZERO(&sums); From 9f569ca50b03dca2b494bf0a641fa68703557d15 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Tue, 2 Apr 2024 15:41:56 +0000 Subject: [PATCH 055/105] massively rewrite assembly routines. --- ggml-phi-knc-dot_q5_K_q8_K.c | 237 ++++++++++++++++++++--------------- 1 file changed, 135 insertions(+), 102 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 418fa772d..1145dfff7 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -20,112 +20,154 @@ // We can fit 16 of these float32s in a single vector register. 
#define GGML_F32_EPR 16 +/* we force an alignment, because i haven't written unaligned forms of the assembly functions, yet.. */ typedef float float32x8_t __attribute__((vector_size (32))); -typedef float float32x16_t __attribute__((vector_size (64))); -typedef int8_t int8x16_t __attribute__((vector_size (16))); +typedef float float32x16_t __attribute__((vector_size (64), aligned(64))); +typedef int8_t int8x16_t __attribute__((vector_size (16), aligned(16))); +typedef uint8_t uint8x16_t __attribute__((vector_size (16), aligned(16))); typedef int16_t int16x8_t __attribute__((vector_size (16))); typedef int16_t int16x16_t __attribute__((vector_size (32))); typedef int32_t int32x8_t __attribute__((vector_size (32))); -typedef int32_t int32x16_t __attribute__((vector_size (64))); +typedef int32_t int32x16_t __attribute__((vector_size (64), aligned(64))); /* A forward declaration, to keep GCC happy. */ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); -/* clear a vector of 8 floats. */ -inline static void GGML_F32x8_VEC_ZERO(float32x8_t *target) -{ - uint8_t zero[4] __attribute__((aligned(32))) = {0,0,0,0}; - uint32_t mask=0x000000FF; - - __asm__ __volatile__ ( - "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our register. - "kmov\t%[M],\t%%k1\n\t" - "vmovaps\t\t%%zmm8,\t%[RES]%{%%k1%}\n\t" - : [RES] "+m" (*target) - : [Z] "m" (zero), - [M] "r" (mask) - : "zmm8", "k1", "memory"); -} - /* clear a vector of 16 floats. */ inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target) { - uint8_t zero[4] __attribute__((aligned(32))) = {0,0,0,0}; + uint8_t zero=0; __asm__ __volatile__ ( - "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our register. + "vbroadcastss\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our register. "vmovaps\t\t%%zmm8,\t%[RES]\n\t" : [RES] "+m" (*target) : [Z] "m" (zero) : "zmm8", "memory"); } -/* clear a vector of 8 int32_ts. */ -inline static void GGML_I32x8_VEC_ZERO(int32x8_t *target) +// This function perform two multiplies of an I8x16 and an I8x16 vector into two I16x16 vectors. then does an FMA on the scaled result of multiplying the two I16x16 vectors, adding the result into an I32x16. +// it loops 8 times. well, actually four, with an unroll. +inline static void GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16 (int8x16_t *src11, uint8x16_t *src21, const uint8_t *scale, int32x16_t *res) { - uint8_t zero[4] __attribute__((aligned(32))) = {0,0,0,0}; - uint32_t mask=0x000000FF; + uint8_t zero = 0; __asm__ __volatile__ ( - "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our register. - "kmov\t%[M],\t%%k1\n\t" - "vmovaps\t\t%%zmm8,\t%[RES]%{%%k1%}\n\t" - : [RES] "+m" (*target) - : [Z] "m" (zero), - [M] "r" (mask) - : "zmm8", "k1", "memory"); + "vprefetche0\t(%[SRC11])\n\t" + "vprefetche0\t(%[SRC21])\n\t" + "vprefetche0\t(%[SCALE])\n\t" + "mov\t$0,\t%%ecx\n\t" + "mov\t%[SRC11],\t%%r12\n\t" + "mov\t%[SRC21],\t%%r8\n\t" + "mov\t%[SCALE],\t%%r9\n\t" + "vpbroadcastd\t%[Z]%{uint8%},\t%%zmm7\n\t" // empty our result. + + "1:\n\t" + "inc\t%%ecx\n\t" // we are in our loop, increment our counter. + "cmp\t$4,\t%%ecx\n\t" // see if this is our last run-through. + "vmovdqa32\t\t(%%r12)%{sint8%},\t%%zmm0\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. 
+ "vmovdqa32\t\t(%%r8)%{uint8%},\t%%zmm1\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. + "vpmulld\t%%zmm0,\t%%zmm1,\t%%zmm2\n\t" // perform our 64 bit multiply, low side. + "vpbroadcastd\t(%%r9)%{uint8%},\t%%zmm6\n\t" // load the item we will be multiplying by. + "vpmadd231d\t%%zmm2,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. + "vmovdqa32\t\t16(%%r12)%{sint8%},\t%%zmm3\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vmovdqa32\t\t16(%%r8)%{uint8%},\t%%zmm4\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. + "vpmulld\t%%zmm3,\t%%zmm4,\t%%zmm5\n\t" // perform our 64 bit multiply, low side. + "vpmadd231d\t%%zmm5,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. + "vmovdqa32\t\t32(%%r12)%{sint8%},\t%%zmm8\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vmovdqa32\t\t32(%%r8)%{uint8%},\t%%zmm1\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. + "vpmulld\t%%zmm8,\t%%zmm1,\t%%zmm2\n\t" // perform our 64 bit multiply, low side. + "vpbroadcastd\t1(%%r9)%{uint8%},\t%%zmm6\n\t" // load the item we will be multiplying by. + "vpmadd231d\t%%zmm2,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. + "vmovdqa32\t\t48(%%r12)%{sint8%},\t%%zmm3\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vmovdqa32\t\t48(%%r8)%{uint8%},\t%%zmm4\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. + "vpmulld\t%%zmm3,\t%%zmm4,\t%%zmm5\n\t" // perform our 64 bit multiply, low side. + "vpmadd231d\t%%zmm5,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. + "je\t2f\n\t" // if this is the last time through our loop, jump to 2. + "vprefetche0\t64(%%r12)\n\t" // otherwise, prepare for another run-through. + "vprefetche0\t64(%%r8)\n\t" + "vprefetche2\t128(%%r12)\n\t" + "vprefetche2\t128(%%r8)\n\t" + "add\t$64,\t%%r12\n\t" + "add\t$64,\t%%r8\n\t" + "add\t$2,\t%%r9\n\t" + "jmp\t1b\n\t" + "2:\n\t" + "vmovdqa32\t\t%%zmm7,\t(%[RES])\n\t" // save the result. + : [RES] "+r" (res) + : [SRC11] "r" (src11), + [SRC21] "r" (src21), + [SCALE] "r" (scale), + [Z] "m" (zero) + : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "ecx", "r8", "r9", "r12", "memory"); } -/* clear a vector of 16 int32_ts. */ -inline static void GGML_I32x16_VEC_ZERO(int32x16_t *target) +// Unpack 256 unsigned 5 bit values into an 8 bit vector. +inline static void GGML_5bit_Unpack (const uint8x16_t * q4, const uint8_t * q1, uint8x16_t * dst) { - uint8_t zero[4] __attribute__((aligned(32))) = {0,0,0,0}; + uint8_t lowmask = 0x0F; + uint32_t allmask=0xFFFFFFFF; + uint8_t m=1; + uint8_t bit5 = 0x10; __asm__ __volatile__ ( - "vbroadcastI32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our register. - "vmovaps\t\t%%zmm8,\t%[RES]\n\t" - : [RES] "+m" (*target) - : [Z] "m" (zero) - : "zmm8", "memory"); + "vprefetche0\t(%[SRC1])\n\t" + "vprefetche0\t(%[SRC4])\n\t" + "vprefetche1\t64(%[SRC4])\n\t" + "mov\t%[SRC4],\t%%r12\n\t" // load the address of the head of our 4-bit list. + "mov\t%[DST],\t%%r8\n\t" // load the address of the head of our destination list. + "mov\t$0,%%ecx\n\t" // initialize our counter. + "vmovdqa32\t(%[SRC1])%{uint8%},\t%%zmm6\n\t" // move 16 packed sets of single bits into the lower 8 bits of zmm6. + "vmovdqa32\t16(%[SRC1])%{uint8%},\t%%zmm7\n\t" // move the next 16 packed sets of single bits into the lower 8 bits of zmm7. 
+ "vpbroadcastd\t%[MASK]%{uint8%},\t%%zmm2\n\t " // load our mask. + "vpbroadcastd\t%[BIT5]%{uint8},\t%%zmm9\n\t" // load the bit we want to add (conditionally). + "vpbroadcastd\t%[M]%{uint8%},\t%%zmm8\n\t" // select which bit we want to test for. + + "1:\n\t" + "inc\t%%ecx\n\t" // we are in the loop. increment the counter. + + "vptestmd\t%%zmm6,\t%%zmm8,\t%%k1\n\t" // perform our test. + "vptestmd\t%%zmm7,\t%%zmm8,\t%%k2\n\t" // perform our test. + "vmovdqa32\t\t(%%r12)%{uint8%},\t%%zmm0\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vpandd\t%%zmm0,\t%%zmm2,\t%%zmm4\n\t" // apply a mask, storing the low four bits of vector zmm0 into zmm4. + "vpaddd\t%%zmm4,%%zmm9,%%zmm4%{%%k1%}\n\t" // turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm4%{uint8%},\t(%%r8)\n\t" // save our result. + "vmovdqa32\t\t16(%%r12)%{uint8%},\t%%zmm1\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vpandd\t%%zmm1,\t%%zmm2,\t%%zmm5\n\t" // apply a mask, storing the next low four bits of vector zmm1 into zmm5. + "vpaddd\t%%zmm5,%%zmm9,%%zmm5%{%%k2%}\n\t" // turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm5%{uint8%},\t16(%%r8)\n\t" // save our result. + + "add\t$32,\t%%r8\n\t" + "cmp\t$4,\t%%ecx\n\t" + "vpslld\t$1,\t%%zmm8,\t%%zmm8\n\t" // select which bit we want to test for. + + "vptestmd\t%%zmm6,\t%%zmm8,\t%%k1\n\t" // perform our test. + "vptestmd\t%%zmm7,\t%%zmm8,\t%%k2\n\t" // perform our test. + "vpsrld\t$4,\t%%zmm0,\t%%zmm4\n\t" // load our even 4 bit sequence into zmm4. + "vpaddd\t%%zmm4,%%zmm9,%%zmm4%{%%k1%}\n\t" // turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm4%{uint8%},\t(%%r8)\n\t" // save our result. + "vpsrld\t$4,\t%%zmm1,\t%%zmm5\n\t" // load our even 4 bit sequence into zmm5. + "vpaddd\t%%zmm5,%%zmm9,%%zmm5%{%%k2%}\n\t" // turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm5%{uint8%},\t16(%%r8)\n\t" // save our result. + + "je\t2f\n\t" + + "vpslld\t$1,\t%%zmm8,\t%%zmm8\n\t" // select which bit we want to test for. + "add\t$32,\t%%r12\n\t" + "add\t$32,\t%%r8\n\t" + "jmp\t1b\n\t" + "2:" + : [DST] "+r" (dst) + : [SRC4] "r" (q4), + [SRC1] "r" (q1), + [MASK] "m" (lowmask), + [M] "m" (m), + [ALL] "m" (allmask), + [BIT5] "m" (bit5) + : "zmm0", "zmm1", "zmm2", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "ecx", "k1", "k2", "r12", "r8", "memory" + ); } - -// perform a Fused Multiply Add of an I16x8 times scalar S into I32x8. -inline static void GGML_I16x8_S_FMA_I32x8 (int16x8_t *src, int32_t scale, int32x8_t *dest) -{ - uint32_t mask=0x000000FF; - int32_t scaleVec[4] = {scale, scale, scale, scale}; - - __asm__ __volatile__ ( - "kmov\t%[M],\t%%k1\n\t" // we will only be working with 8 values at a time. le sigh. - "vmovdqa32\t\t%[SRC]%{sint16%},\t%%zmm0%{%%k1%}\n\t" // load the item we will be summing from. upscale it from int16. - "vbroadcastI32x4\t%[SCALE],\t%%zmm1\n\t" // load the item we will be multiplying by. - "vmovdqa32\t\t%[RES],\t%%zmm2%{%%k1%}\n\t" // load the item we will be summing onto. - "vpmadd231d\t%%zmm0,\t%%zmm1,\t%%zmm2%{%%k1%}\n\t" // perform our multiply-add. - "vmovdqa32\t\t%%zmm2,\t%[RES]%{%%k1}\n\t" // save the result. - : [RES] "+m" (*dest) - : [M] "r" (mask), - [SRC] "m" (*src), - [SCALE] "m" (scaleVec) - : "zmm0", "zmm1", "zmm2", "k1", "memory"); -} - -// perform a Fused Multiply Add of an I16x16 times scalar S into I32x16. 
-inline static void GGML_I16x16_S_FMA_I32x16 (int16x16_t *src, int32_t scale, int32x16_t *dest) -{ - int32_t scaleVec[4] = {scale, scale, scale, scale}; - - __asm__ __volatile__ ( - "vmovdqa32\t\t%[SRC]%{sint16%},\t%%zmm0\n\t" // load the item we will be summing from. upscale it from int16. - "vbroadcastI32x4\t%[SCALE],\t%%zmm1\n\t" // load the item we will be multiplying by. - "vmovdqa32\t\t%[RES],\t%%zmm2\n\t" // load the item we will be summing onto. - "vpmadd231d\t%%zmm0,\t%%zmm1,\t%%zmm2\n\t" // perform our multiply-add. - "vmovdqa32\t\t%%zmm2,\t%[RES]\n\t" // save the result. - : [RES] "+m" (*dest) - : [SRC] "m" (*src), - [SCALE] "m" (scaleVec) - : "zmm0", "zmm1", "zmm2", "memory"); -} - + void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { /* interpret X and Y as vectors. */ @@ -144,32 +186,26 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * scales = (const uint8_t*)&utmp[0]; const uint8_t * mins = (const uint8_t*)&utmp[2]; - float32x16_t sums __attribute__((aligned(64))); - int8x16_t aux8[QK_K/16] __attribute__((aligned(16))); - int16x16_t aux16[QK_K/16] __attribute__((aligned(32))); - int32x16_t aux32 __attribute__((aligned(64))); + float32x16_t sums; + // clear sums. GGML_F32x16_VEC_ZERO(&sums); float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + int8x16_t q8copy [QK_K]; + int32x16_t aux32; + uint8x16_t q4copyvec [QK_K/32]; + uint8x16_t aux8 [QK_K/16]; - int8_t * restrict a = (int8_t * restrict)aux8; - uint8_t m = 1; + // Fill in our 8 bit vector from y[]. required, because there is no good way to align members of y[], And I haven't mastered unaligned assembly yet... + memcpy (q8copy, y[i].qs, QK_K); - // Fill the 8 bit vector a with our 5 bit quantization data, 64 blocks at a time. - for (int j = 0; j < QK_K/64; ++j) { - for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); - for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); - a += 32; m <<= 1; - for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4); - for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); - a += 32; m <<= 1; - q4 += 32; - } + // Fill in our 4 bit vector from x[]. required, because there is no good way to align members of x[], And I haven't mastered unaligned assembly yet... + memcpy (q4copyvec, x[i].qs, QK_K/2); + + // combine our 4 and 1 bit vector sets into an 8 bit value. + GGML_5bit_Unpack(q4copyvec, x[i].qh, aux8); memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -194,17 +230,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r } // FIXME: while comparing FMA output to the original output, the original had an error. hunt it down. 
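    // (Reference semantics of the step below, in plain scalar C: for each group of 32
    //  quants the group's 6-bit scale is applied to the int8 x int8 products, which are
    //  accumulated into 16 int32 lanes -- roughly:
    //
    //      for (int j = 0; j < QK_K/32; ++j)
    //          for (int l = 0; l < 32; ++l)
    //              ((int32_t *)&aux32)[l % 16] += (int32_t)scales[j] * q8[j*32 + l] * a[j*32 + l];
    //
    //  where a[] holds the unpacked 5-bit quants and q8[] the 8-bit quants (q8copy/aux8
    //  after this change). The assembly routine that replaces the loop is meant to
    //  reproduce exactly this accumulation, which is what the FIXME above is comparing.)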
- for (int j = 0; j < QK_K/32; ++j) { - int32_t scale = scales[is++]; - GGML_I16x16_S_FMA_I32x16 (&aux16[j*2], scale, &aux32); - GGML_I16x16_S_FMA_I32x16 (&aux16[(j*2)+1], scale, &aux32); - } + GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16(q8copy, aux8, scales, &aux32); const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 16; ++l) ((float *)&sums)[l] += d * ((int32_t *)&aux32)[l]; const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } + for (int l = 0; l < 16; ++l) sumf += ((float *)&sums)[l]; *s = sumf; } From 8c17353717a297d39edb8cd6b6e7b7a5350f94d3 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Tue, 2 Apr 2024 16:55:40 +0000 Subject: [PATCH 056/105] minor changes. --- ggml-phi-knc-dot_q5_K_q8_K.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 1145dfff7..02545057f 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -21,13 +21,9 @@ #define GGML_F32_EPR 16 /* we force an alignment, because i haven't written unaligned forms of the assembly functions, yet.. */ -typedef float float32x8_t __attribute__((vector_size (32))); typedef float float32x16_t __attribute__((vector_size (64), aligned(64))); typedef int8_t int8x16_t __attribute__((vector_size (16), aligned(16))); typedef uint8_t uint8x16_t __attribute__((vector_size (16), aligned(16))); -typedef int16_t int16x8_t __attribute__((vector_size (16))); -typedef int16_t int16x16_t __attribute__((vector_size (32))); -typedef int32_t int32x8_t __attribute__((vector_size (32))); typedef int32_t int32x16_t __attribute__((vector_size (64), aligned(64))); /* A forward declaration, to keep GCC happy. */ @@ -168,6 +164,8 @@ inline static void GGML_5bit_Unpack (const uint8x16_t * q4, const uint8_t * q1, ); } +// A function for getting the dot product of two vectors, one of 5 bit resolution, and one of 8. +// Used during inference, if your model prints "llama_model_loader: - type q5_K: XXX tensors", and XXX is not zero. :) void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { /* interpret X and Y as vectors. */ @@ -207,6 +205,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r // combine our 4 and 1 bit vector sets into an 8 bit value. GGML_5bit_Unpack(q4copyvec, x[i].qh, aux8); + // extract scales and mins.. memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); const uint32_t uaux = utmp[1] & kmask1; @@ -220,24 +219,17 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r GGML_I32x16_VEC_ZERO(&aux32); - for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; - int is = 0; - for (int j = 0; j < QK_K/32; ++j) { - for (int l = 0; l < 16; ++l) ((int16_t *)&aux16[j*2])[l] = q8[l] * a[l]; - q8 += 16; a += 16; - for (int l = 0; l < 16; ++l) ((int16_t *)&aux16[(j*2)+1])[l] = q8[l] * a[l]; - q8 += 16; a += 16; - } - // FIXME: while comparing FMA output to the original output, the original had an error. hunt it down. 
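    // (Per super-block i, the contribution to *s is
    //      d * sum_j scales[j] * dot(q8 block j, unpacked q5 block j)
    //        - dmin * sum_k y[i].bsums[k] * mins[k/2]
    //  with d = GGML_FP16_TO_FP32(x[i].d) * y[i].d and dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d.
    //  The call below accumulates the first sum across the 16 int32 lanes of aux32; the
    //  bsums loop that now follows it supplies the subtracted minimums term.)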
GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16(q8copy, aux8, scales, &aux32); + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 16; ++l) ((float *)&sums)[l] += d * ((int32_t *)&aux32)[l]; const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } - + for (int l = 0; l < 16; ++l) sumf += ((float *)&sums)[l]; *s = sumf; } From 47190a7fe2fe405c4bb1047f950245237d91b46b Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Tue, 2 Apr 2024 17:01:53 +0000 Subject: [PATCH 057/105] formatting. --- ggml-phi-knc-dot_q5_K_q8_K.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 02545057f..b8262b071 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -166,7 +166,7 @@ inline static void GGML_5bit_Unpack (const uint8x16_t * q4, const uint8_t * q1, // A function for getting the dot product of two vectors, one of 5 bit resolution, and one of 8. // Used during inference, if your model prints "llama_model_loader: - type q5_K: XXX tensors", and XXX is not zero. :) -void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { /* interpret X and Y as vectors. */ const block_q5_K * restrict x = vx; From 96fdd214c85789074838cabf1dd47c30c6ed993a Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Wed, 3 Apr 2024 19:01:18 +0000 Subject: [PATCH 058/105] indent headers consistently. --- ggml-phi-knc-dot_q5_K_q8_K.h | 6 ++++-- ggml-phi-knc.h | 8 +++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.h b/ggml-phi-knc-dot_q5_K_q8_K.h index b416803e0..e1e15d400 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.h +++ b/ggml-phi-knc-dot_q5_K_q8_K.h @@ -1,13 +1,15 @@ +// Formatted with: indent -npcs -nlp -i4 -l300 #pragma once #include "ggml.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif /* A forward declaration, to keep GCC happy. */ -void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); + void ggml_vec_dot_q5_K_q8_K(int n, float *restrict s, size_t bs, const void *restrict vx, size_t bx, const void *restrict vy, size_t by, int nrc); #ifdef __cplusplus } diff --git a/ggml-phi-knc.h b/ggml-phi-knc.h index d2fd11428..94d444627 100644 --- a/ggml-phi-knc.h +++ b/ggml-phi-knc.h @@ -1,13 +1,15 @@ +// Formatted with: indent -npcs -nlp -i4 -l300 #pragma once #include "ggml.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif - /* A forward declaration, to keep GCC happy. */ - void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); + /* A forward declaration, to keep GCC happy. */ + void ggml_vec_dot_f32(int n, float *restrict s, size_t bs, const float *restrict x, size_t bx, const float *restrict y, size_t by, int nrc); #ifdef __cplusplus } From 6f67ea886f276639d9c288018f587a420058f2e1 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Wed, 3 Apr 2024 20:24:00 +0000 Subject: [PATCH 059/105] formatting changes. 
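
Besides re-indenting, this also tightens the float32x16_t typedef in ggml-phi-knc.c:
GCC's vector_size attribute is measured in bytes, so sixteen float32s need
vector_size (64) -- exactly one 512-bit zmm register -- rather than the old (128).
A rough compile-time check of that relationship (an illustrative sketch only, assuming
a C11 compiler; it is not part of the patched sources):

    // One IMCI zmm register holds 16 x 32-bit floats = 64 bytes.
    typedef float float32x16_t __attribute__((vector_size (64), aligned (64)));

    _Static_assert(sizeof(float32x16_t) == 16 * sizeof(float),
                   "float32x16_t must be exactly one 64-byte zmm register");
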
--- ggml-phi-knc.c | 75 ++++++++++++++++++++++++-------------------------- 1 file changed, 36 insertions(+), 39 deletions(-) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index e767e2306..003c70b56 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -6,14 +6,11 @@ // For memcpy. #include -// This SIMD unit can work with 32 float32s at once. -#define GGML_F32_STEP 32 // We can fit 16 of these float32s in a single vector register. #define GGML_F32_EPR 16 -// a single vector. 128*32=512 -typedef float float32x16_t __attribute__((vector_size (128))); -#define GGML_F32x16 float32x16_t +// A vector of 16 floats. +typedef float float32x16_t __attribute__((vector_size (64), aligned (64))); // A forward declaration, to keep GCC happy... void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); @@ -23,7 +20,7 @@ inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target) uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; __asm__ __volatile__ ( - "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. + "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. "vmovnraps\t\t%%zmm8,\t%[RES]\n\t" : [RES] "+m" (*target) : [Z] "m" (zero) @@ -36,10 +33,10 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; __asm__ __volatile__ ( - "mov\t%[ITER],%%r8\n\t" // how many register sized chunks are we responsible for - "mov\t%[VEC1],%%r10\n\t" // where do we start work in mvec1? - "mov\t%[VEC2],%%r12\n\t" // where do we start work in mvec2? - "cmp\t$1,%[CLR]\n\t" // should we clear the sum before we start? + "mov\t%[ITER],%%r8\n\t" // how many register sized chunks are we responsible for + "mov\t%[VEC1],%%r10\n\t" // where do we start work in mvec1? + "mov\t%[VEC2],%%r12\n\t" // where do we start work in mvec2? + "cmp\t$1,%[CLR]\n\t" // should we clear the sum before we start? "jne\t4f\n\t" "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm0\n\t" // if so, use an upscaling operator to do it. "vprefetchnta\t(%%r10)\n\t" @@ -55,47 +52,47 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x "jmp\t1f\n\t" "4:\n\t" "vprefetch0\t(%[RES])\n\t" - "vmovaps\t\t(%[RES]),\t%%zmm0\n\t" // otherwise, load our inital state from sum.. + "vmovaps\t\t(%[RES]),\t%%zmm0\n\t" // otherwise, load our inital state from sum.. "vprefetchnta\t(%%r10)\n\t" "vprefetchnta\t(%%r12)\n\t" "1:\n\t" - "cmp\t$3,\t%%r8\n\t" // Compare iterations to three. - "jnae\t6f\n\t" // If there are not three iterations left, jump to label 6. - "vmovaps\t\t(%%r10),\t%%zmm1\n\t" // Load two vectors. + "cmp\t$3,\t%%r8\n\t" // Compare iterations to three. + "jnae\t6f\n\t" // If there are not three iterations left, jump to label 6. + "vmovaps\t\t(%%r10),\t%%zmm1\n\t" // Load two vectors. "vmovaps\t\t(%%r12),\t%%zmm2\n\t" - "sub\t$3,\t%%r8\n\t" // Decrement iterations - "vprefetchnta\t192(%%r10)\n\t" // prefetch the next float32x16_t block (192 bytes ahead) + "sub\t$3,\t%%r8\n\t" // Decrement iterations + "vprefetchnta\t192(%%r10)\n\t" // prefetch the next float32x16_t block (192 bytes ahead) "vprefetchnta\t192(%%r12)\n\t" - "vmovaps\t\t64(%%r10),\t%%zmm3\n\t" // Load two vectors. + "vmovaps\t\t64(%%r10),\t%%zmm3\n\t" // Load two vectors. 
"vmovaps\t\t64(%%r12),\t%%zmm4\n\t" - "vprefetch1\t320(%%r10)\n\t" // prefetch the block after the block after the next float32x16_t block (320 bytes ahead) + "vprefetch1\t320(%%r10)\n\t" // prefetch the block after the block after the next float32x16_t block (320 bytes ahead) "vprefetch1\t320(%%r12)\n\t" - "vmovaps\t\t128(%%r10),\t%%zmm5\n\t" // Load two vectors. + "vmovaps\t\t128(%%r10),\t%%zmm5\n\t" // Load two vectors. "vmovaps\t\t128(%%r12),\t%%zmm6\n\t" "vprefetch1\t576(%%r10)\n\t" "vprefetch1\t576(%%r12)\n\t" "vprefetch1\t704(%%r10)\n\t" "vprefetch1\t704(%%r12)\n\t" - "add\t$192,\t%%r10\n\t" // Move to the next float32x16_t block (192 bytes ahead) + "add\t$192,\t%%r10\n\t" // Move to the next float32x16_t block (192 bytes ahead) "add\t$192,\t%%r12\n\t" - "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add - "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add - "vfmadd231ps\t%%zmm5,\t%%zmm6,\t%%zmm0\n\t" // Perform a fused multiply add - "jmp\t1b\n\t" // Jump back to the start of the loop - "6:\n\t" // we know we are near the tail. handle 2, 1, and 0 cases. - "cmp\t$0,\t%%r8\n\t" // Compare iterations to zero - "je\t2f\n\t" // Jump to label 2 if zero (end of loop) - "cmp\t$1,\t%%r8\n\t" // Compare iterations to one - "vmovaps\t\t(%%r10),\t%%zmm1\n\t" // Load two vectors. + "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add + "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add + "vfmadd231ps\t%%zmm5,\t%%zmm6,\t%%zmm0\n\t" // Perform a fused multiply add + "jmp\t1b\n\t" // Jump back to the start of the loop + "6:\n\t" // we know we are near the tail. handle 2, 1, and 0 cases. + "cmp\t$0,\t%%r8\n\t" // Compare iterations to zero + "je\t2f\n\t" // Jump to label 2 if zero (end of loop) + "cmp\t$1,\t%%r8\n\t" // Compare iterations to one + "vmovaps\t\t(%%r10),\t%%zmm1\n\t" // Load two vectors. "vmovaps\t\t(%%r12),\t%%zmm2\n\t" - "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add - "je\t2f\n\t" // Jump to label 3 if one (end of loop) - // No compare. we must be two. - "vmovaps\t\t64(%%r10),\t%%zmm3\n\t" // Load two vectors. + "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add + "je\t2f\n\t" // Jump to label 3 if one (end of loop) + // No compare. we must be two. + "vmovaps\t\t64(%%r10),\t%%zmm3\n\t" // Load two vectors. "vmovaps\t\t64(%%r12),\t%%zmm4\n\t" - "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add - "2:\n\t" // Label for loop end - "vmovnraps\t\t%%zmm0,\t(%[RES])\n\t" // save our results. + "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add + "2:\n\t" // Label for loop end + "vmovnraps\t\t%%zmm0,\t(%[RES])\n\t" // save our results. : [RES] "+r" (sumvec) : [ITER] "r" (iterations), [VEC1] "r" (mvec1), @@ -109,7 +106,7 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) { // our sum. - float32x16_t sum __attribute__((aligned(64))); + float32x16_t sum; // the number of vector-sized steps we will need to do. const uint32_t np = (n & ~(GGML_F32_EPR - 1)); @@ -121,10 +118,10 @@ void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restri { // add the leftovers, that could not be handled by the vector loop. // our extended last part of x. 
- float32x16_t v1 __attribute__((aligned(64))); + float32x16_t v1; GGML_F32x16_VEC_ZERO(&v1); // our extended last part of y. - float32x16_t v2 __attribute__((aligned(64))); + float32x16_t v2; GGML_F32x16_VEC_ZERO(&v2); memcpy(&v1, &x[np], (n - np)*sizeof(float)); From 941257220519b6ad948bedb220778e3ca076c860 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Wed, 3 Apr 2024 20:30:25 +0000 Subject: [PATCH 060/105] add Makefile rule for generation .s file, for manual inspection. --- Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile b/Makefile index 42861f4b4..517aa168b 100644 --- a/Makefile +++ b/Makefile @@ -711,6 +711,9 @@ bench-phi-knc: bench-phi-knc.c ggml-phi-knc.o $(CC) $(CFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CC) $(CFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +ggml-phi-knc-dot_q5_K_q8_K.s: ggml-phi-knc-dot_q5_K_q8_K.c + $(CC) $(CFLAGS) -S $< -o $(call GET_ASM_FILE, $<) + infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) From 84df774d6a193d6487486bfe563338a6cdd6a200 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Wed, 3 Apr 2024 21:58:29 +0000 Subject: [PATCH 061/105] whoops. missing tab. --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 517aa168b..875bec482 100644 --- a/Makefile +++ b/Makefile @@ -712,7 +712,7 @@ bench-phi-knc: bench-phi-knc.c ggml-phi-knc.o $(CC) $(CFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) ggml-phi-knc-dot_q5_K_q8_K.s: ggml-phi-knc-dot_q5_K_q8_K.c - $(CC) $(CFLAGS) -S $< -o $(call GET_ASM_FILE, $<) + $(CC) $(CFLAGS) -S $< -o $(call GET_ASM_FILE, $<) infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) From 9ad5efafb020a48bc01e4a17a9d7c99e332190a4 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Wed, 3 Apr 2024 22:04:45 +0000 Subject: [PATCH 062/105] use GGML_F32_EPR, and remove some dead code. --- ggml-phi-knc-dot_q5_K_q8_K.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index b8262b071..4c46fd2b3 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -6,13 +6,10 @@ // Yes, we have to tell this header to actually export stuff. #define GGML_COMMON_IMPL_C -#include "ggml-common.h" #include "ggml-quants.h" #include "ggml-impl.h" -// FIXME: why do we have to import this twice? -#define GGML_COMMON_IMPL_C -// For block_q5_K and block_q8_K. only given the second time. +// For block_q5_K and block_q8_K. #include "ggml-common.h" // This SIMD unit can work with 32 float32s at once. @@ -213,23 +210,17 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r utmp[2] = uaux; utmp[0] &= kmask1; - a = (int8_t * restrict)aux8; - - int sumi = 0; - - GGML_I32x16_VEC_ZERO(&aux32); - // FIXME: while comparing FMA output to the original output, the original had an error. hunt it down. 
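    // (Dropping the GGML_I32x16_VEC_ZERO(&aux32) call above is safe: the routine below
    //  starts by broadcasting zero into its zmm7 accumulator and ends by storing the
    //  whole 64-byte register to *res, so aux32 is fully overwritten on every call.)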
GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16(q8copy, aux8, scales, &aux32); int sumi = 0; for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - for (int l = 0; l < 16; ++l) ((float *)&sums)[l] += d * ((int32_t *)&aux32)[l]; + for (int l = 0; l < GGML_F32_EPR; ++l) ((float *)&sums)[l] += d * ((int32_t *)&aux32)[l]; const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } - for (int l = 0; l < 16; ++l) sumf += ((float *)&sums)[l]; + for (int l = 0; l < GGML_F32_EPR; ++l) sumf += ((float *)&sums)[l]; *s = sumf; } From 9152143fe7635efc3e9825ab6aa872eda861315b Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Wed, 3 Apr 2024 23:21:24 +0000 Subject: [PATCH 063/105] reformat, and label what these files are. --- ggml-phi-knc-dot_q5_K_q8_K.c | 317 +++++++++++++++++------------------ ggml-phi-knc.c | 204 +++++++++++----------- 2 files changed, 260 insertions(+), 261 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 4c46fd2b3..f7028991d 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -1,3 +1,6 @@ +/* Xeon PHI IMCI support. */ +/* formatted by using emacs, with (M-x set-variable RET c-basic-offset RET 4 RET) executed. */ + // For uint32_t #include @@ -29,198 +32,192 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r /* clear a vector of 16 floats. */ inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target) { - uint8_t zero=0; + uint8_t zero=0; + + __asm__ __volatile__ ( + "vbroadcastss\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our register. + "vmovaps\t\t%%zmm8,\t%[RES]\n\t" + : [RES] "+m" (*target) + : [Z] "m" (zero) + : "zmm8", "memory"); - __asm__ __volatile__ ( - "vbroadcastss\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our register. - "vmovaps\t\t%%zmm8,\t%[RES]\n\t" - : [RES] "+m" (*target) - : [Z] "m" (zero) - : "zmm8", "memory"); } // This function perform two multiplies of an I8x16 and an I8x16 vector into two I16x16 vectors. then does an FMA on the scaled result of multiplying the two I16x16 vectors, adding the result into an I32x16. // it loops 8 times. well, actually four, with an unroll. inline static void GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16 (int8x16_t *src11, uint8x16_t *src21, const uint8_t *scale, int32x16_t *res) { - uint8_t zero = 0; + uint8_t zero = 0; - __asm__ __volatile__ ( - "vprefetche0\t(%[SRC11])\n\t" - "vprefetche0\t(%[SRC21])\n\t" - "vprefetche0\t(%[SCALE])\n\t" - "mov\t$0,\t%%ecx\n\t" - "mov\t%[SRC11],\t%%r12\n\t" - "mov\t%[SRC21],\t%%r8\n\t" - "mov\t%[SCALE],\t%%r9\n\t" - "vpbroadcastd\t%[Z]%{uint8%},\t%%zmm7\n\t" // empty our result. - - "1:\n\t" - "inc\t%%ecx\n\t" // we are in our loop, increment our counter. - "cmp\t$4,\t%%ecx\n\t" // see if this is our last run-through. - "vmovdqa32\t\t(%%r12)%{sint8%},\t%%zmm0\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. - "vmovdqa32\t\t(%%r8)%{uint8%},\t%%zmm1\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. - "vpmulld\t%%zmm0,\t%%zmm1,\t%%zmm2\n\t" // perform our 64 bit multiply, low side. - "vpbroadcastd\t(%%r9)%{uint8%},\t%%zmm6\n\t" // load the item we will be multiplying by. - "vpmadd231d\t%%zmm2,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. - "vmovdqa32\t\t16(%%r12)%{sint8%},\t%%zmm3\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. 
- "vmovdqa32\t\t16(%%r8)%{uint8%},\t%%zmm4\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. - "vpmulld\t%%zmm3,\t%%zmm4,\t%%zmm5\n\t" // perform our 64 bit multiply, low side. - "vpmadd231d\t%%zmm5,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. - "vmovdqa32\t\t32(%%r12)%{sint8%},\t%%zmm8\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. - "vmovdqa32\t\t32(%%r8)%{uint8%},\t%%zmm1\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. - "vpmulld\t%%zmm8,\t%%zmm1,\t%%zmm2\n\t" // perform our 64 bit multiply, low side. - "vpbroadcastd\t1(%%r9)%{uint8%},\t%%zmm6\n\t" // load the item we will be multiplying by. - "vpmadd231d\t%%zmm2,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. - "vmovdqa32\t\t48(%%r12)%{sint8%},\t%%zmm3\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. - "vmovdqa32\t\t48(%%r8)%{uint8%},\t%%zmm4\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. - "vpmulld\t%%zmm3,\t%%zmm4,\t%%zmm5\n\t" // perform our 64 bit multiply, low side. - "vpmadd231d\t%%zmm5,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. - "je\t2f\n\t" // if this is the last time through our loop, jump to 2. - "vprefetche0\t64(%%r12)\n\t" // otherwise, prepare for another run-through. - "vprefetche0\t64(%%r8)\n\t" - "vprefetche2\t128(%%r12)\n\t" - "vprefetche2\t128(%%r8)\n\t" - "add\t$64,\t%%r12\n\t" - "add\t$64,\t%%r8\n\t" - "add\t$2,\t%%r9\n\t" - "jmp\t1b\n\t" - "2:\n\t" - "vmovdqa32\t\t%%zmm7,\t(%[RES])\n\t" // save the result. - : [RES] "+r" (res) - : [SRC11] "r" (src11), - [SRC21] "r" (src21), - [SCALE] "r" (scale), - [Z] "m" (zero) - : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "ecx", "r8", "r9", "r12", "memory"); + __asm__ __volatile__ ( + "vprefetche0\t(%[SRC11])\n\t" + "vprefetche0\t(%[SRC21])\n\t" + "vprefetche0\t(%[SCALE])\n\t" + "mov\t$0,\t%%ecx\n\t" + "mov\t%[SRC11],\t%%r12\n\t" + "mov\t%[SRC21],\t%%r8\n\t" + "mov\t%[SCALE],\t%%r9\n\t" + "vpbroadcastd\t%[Z]%{uint8%},\t%%zmm7\n\t" // empty our result. + "1:\n\t" + "inc\t%%ecx\n\t" // we are in our loop, increment our counter. + "cmp\t$4,\t%%ecx\n\t" // see if this is our last run-through. + "vmovdqa32\t\t(%%r12)%{sint8%},\t%%zmm0\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vmovdqa32\t\t(%%r8)%{uint8%},\t%%zmm1\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. + "vpmulld\t%%zmm0,\t%%zmm1,\t%%zmm2\n\t" // perform our 64 bit multiply, low side. + "vpbroadcastd\t(%%r9)%{uint8%},\t%%zmm6\n\t" // load the item we will be multiplying by. + "vpmadd231d\t%%zmm2,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. + "vmovdqa32\t\t16(%%r12)%{sint8%},\t%%zmm3\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vmovdqa32\t\t16(%%r8)%{uint8%},\t%%zmm4\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. + "vpmulld\t%%zmm3,\t%%zmm4,\t%%zmm5\n\t" // perform our 64 bit multiply, low side. + "vpmadd231d\t%%zmm5,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. + "vmovdqa32\t\t32(%%r12)%{sint8%},\t%%zmm8\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vmovdqa32\t\t32(%%r8)%{uint8%},\t%%zmm1\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. + "vpmulld\t%%zmm8,\t%%zmm1,\t%%zmm2\n\t" // perform our 64 bit multiply, low side. 
+ "vpbroadcastd\t1(%%r9)%{uint8%},\t%%zmm6\n\t" // load the item we will be multiplying by. + "vpmadd231d\t%%zmm2,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. + "vmovdqa32\t\t48(%%r12)%{sint8%},\t%%zmm3\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vmovdqa32\t\t48(%%r8)%{uint8%},\t%%zmm4\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. + "vpmulld\t%%zmm3,\t%%zmm4,\t%%zmm5\n\t" // perform our 64 bit multiply, low side. + "vpmadd231d\t%%zmm5,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. + "je\t2f\n\t" // if this is the last time through our loop, jump to 2. + "vprefetche0\t64(%%r12)\n\t" // otherwise, prepare for another run-through. + "vprefetche0\t64(%%r8)\n\t" + "vprefetche2\t128(%%r12)\n\t" + "vprefetche2\t128(%%r8)\n\t" + "add\t$64,\t%%r12\n\t" + "add\t$64,\t%%r8\n\t" + "add\t$2,\t%%r9\n\t" + "jmp\t1b\n\t" + "2:\n\t" + "vmovdqa32\t\t%%zmm7,\t(%[RES])\n\t" // save the result. + : [RES] "+r" (res) + : [SRC11] "r" (src11), + [SRC21] "r" (src21), + [SCALE] "r" (scale), + [Z] "m" (zero) + : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "cc", "ecx", "r8", "r9", "r12", "memory"); } // Unpack 256 unsigned 5 bit values into an 8 bit vector. inline static void GGML_5bit_Unpack (const uint8x16_t * q4, const uint8_t * q1, uint8x16_t * dst) { - uint8_t lowmask = 0x0F; - uint32_t allmask=0xFFFFFFFF; - uint8_t m=1; - uint8_t bit5 = 0x10; + uint8_t lowmask = 0x0F; + uint32_t allmask=0xFFFFFFFF; + uint8_t m=1; + uint8_t bit5 = 0x10; - __asm__ __volatile__ ( - "vprefetche0\t(%[SRC1])\n\t" - "vprefetche0\t(%[SRC4])\n\t" - "vprefetche1\t64(%[SRC4])\n\t" - "mov\t%[SRC4],\t%%r12\n\t" // load the address of the head of our 4-bit list. - "mov\t%[DST],\t%%r8\n\t" // load the address of the head of our destination list. - "mov\t$0,%%ecx\n\t" // initialize our counter. - "vmovdqa32\t(%[SRC1])%{uint8%},\t%%zmm6\n\t" // move 16 packed sets of single bits into the lower 8 bits of zmm6. - "vmovdqa32\t16(%[SRC1])%{uint8%},\t%%zmm7\n\t" // move the next 16 packed sets of single bits into the lower 8 bits of zmm7. - "vpbroadcastd\t%[MASK]%{uint8%},\t%%zmm2\n\t " // load our mask. - "vpbroadcastd\t%[BIT5]%{uint8},\t%%zmm9\n\t" // load the bit we want to add (conditionally). - "vpbroadcastd\t%[M]%{uint8%},\t%%zmm8\n\t" // select which bit we want to test for. - - "1:\n\t" - "inc\t%%ecx\n\t" // we are in the loop. increment the counter. - - "vptestmd\t%%zmm6,\t%%zmm8,\t%%k1\n\t" // perform our test. - "vptestmd\t%%zmm7,\t%%zmm8,\t%%k2\n\t" // perform our test. - "vmovdqa32\t\t(%%r12)%{uint8%},\t%%zmm0\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vpandd\t%%zmm0,\t%%zmm2,\t%%zmm4\n\t" // apply a mask, storing the low four bits of vector zmm0 into zmm4. - "vpaddd\t%%zmm4,%%zmm9,%%zmm4%{%%k1%}\n\t" // turn on bit 5 for all values that passed the prior test. - "vmovdqa32\t\t%%zmm4%{uint8%},\t(%%r8)\n\t" // save our result. - "vmovdqa32\t\t16(%%r12)%{uint8%},\t%%zmm1\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vpandd\t%%zmm1,\t%%zmm2,\t%%zmm5\n\t" // apply a mask, storing the next low four bits of vector zmm1 into zmm5. - "vpaddd\t%%zmm5,%%zmm9,%%zmm5%{%%k2%}\n\t" // turn on bit 5 for all values that passed the prior test. - "vmovdqa32\t\t%%zmm5%{uint8%},\t16(%%r8)\n\t" // save our result. 
- - "add\t$32,\t%%r8\n\t" - "cmp\t$4,\t%%ecx\n\t" - "vpslld\t$1,\t%%zmm8,\t%%zmm8\n\t" // select which bit we want to test for. - - "vptestmd\t%%zmm6,\t%%zmm8,\t%%k1\n\t" // perform our test. - "vptestmd\t%%zmm7,\t%%zmm8,\t%%k2\n\t" // perform our test. - "vpsrld\t$4,\t%%zmm0,\t%%zmm4\n\t" // load our even 4 bit sequence into zmm4. - "vpaddd\t%%zmm4,%%zmm9,%%zmm4%{%%k1%}\n\t" // turn on bit 5 for all values that passed the prior test. - "vmovdqa32\t\t%%zmm4%{uint8%},\t(%%r8)\n\t" // save our result. - "vpsrld\t$4,\t%%zmm1,\t%%zmm5\n\t" // load our even 4 bit sequence into zmm5. - "vpaddd\t%%zmm5,%%zmm9,%%zmm5%{%%k2%}\n\t" // turn on bit 5 for all values that passed the prior test. - "vmovdqa32\t\t%%zmm5%{uint8%},\t16(%%r8)\n\t" // save our result. - - "je\t2f\n\t" - - "vpslld\t$1,\t%%zmm8,\t%%zmm8\n\t" // select which bit we want to test for. - "add\t$32,\t%%r12\n\t" - "add\t$32,\t%%r8\n\t" - "jmp\t1b\n\t" - "2:" - : [DST] "+r" (dst) - : [SRC4] "r" (q4), - [SRC1] "r" (q1), - [MASK] "m" (lowmask), - [M] "m" (m), - [ALL] "m" (allmask), - [BIT5] "m" (bit5) - : "zmm0", "zmm1", "zmm2", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "ecx", "k1", "k2", "r12", "r8", "memory" - ); + __asm__ __volatile__ ( + "vprefetche0\t(%[SRC1])\n\t" // Issue our memory requests first thing. + "vprefetche0\t(%[SRC4])\n\t" + "vprefetche1\t64(%[SRC4])\n\t" + "mov\t%[SRC4],\t%%r12\n\t" // load the address of the head of our 4-bit list. + "mov\t%[DST],\t%%r8\n\t" // load the address of the head of our destination list. + "mov\t$0,%%ecx\n\t" // initialize our counter. + "vmovdqa32\t(%[SRC1])%{uint8%},\t%%zmm6\n\t" // move 16 packed sets of single bits into the lower 8 bits of zmm6. + "vmovdqa32\t16(%[SRC1])%{uint8%},\t%%zmm7\n\t" // move the next 16 packed sets of single bits into the lower 8 bits of zmm7. + "vpbroadcastd\t%[MASK]%{uint8%},\t%%zmm2\n\t " // load our mask. + "vpbroadcastd\t%[BIT5]%{uint8},\t%%zmm9\n\t" // load the bit we want to add (conditionally). + "vpbroadcastd\t%[M]%{uint8%},\t%%zmm8\n\t" // select which bit we want to test for. + "1:\n\t" + "inc\t%%ecx\n\t" // we are in the loop. increment the counter. + "vptestmd\t%%zmm6,\t%%zmm8,\t%%k1\n\t" // perform our test. + "vptestmd\t%%zmm7,\t%%zmm8,\t%%k2\n\t" // perform our test. + "vmovdqa32\t\t(%%r12)%{uint8%},\t%%zmm0\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vpandd\t%%zmm0,\t%%zmm2,\t%%zmm4\n\t" // apply a mask, storing the low four bits of vector zmm0 into zmm4. + "vpaddd\t%%zmm4,%%zmm9,%%zmm4%{%%k1%}\n\t" // turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm4%{uint8%},\t(%%r8)\n\t" // save our result. + "vmovdqa32\t\t16(%%r12)%{uint8%},\t%%zmm1\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vpandd\t%%zmm1,\t%%zmm2,\t%%zmm5\n\t" // apply a mask, storing the next low four bits of vector zmm1 into zmm5. + "vpaddd\t%%zmm5,%%zmm9,%%zmm5%{%%k2%}\n\t" // turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm5%{uint8%},\t16(%%r8)\n\t" // save our result. + "add\t$32,\t%%r8\n\t" + "cmp\t$4,\t%%ecx\n\t" + "vpslld\t$1,\t%%zmm8,\t%%zmm8\n\t" // select which bit we want to test for. + "vptestmd\t%%zmm6,\t%%zmm8,\t%%k1\n\t" // perform our test. + "vptestmd\t%%zmm7,\t%%zmm8,\t%%k2\n\t" // perform our test. + "vpsrld\t$4,\t%%zmm0,\t%%zmm4\n\t" // load our even 4 bit sequence into zmm4. 
+ "vpaddd\t%%zmm4,%%zmm9,%%zmm4%{%%k1%}\n\t" // turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm4%{uint8%},\t(%%r8)\n\t" // save our result. + "vpsrld\t$4,\t%%zmm1,\t%%zmm5\n\t" // load our even 4 bit sequence into zmm5. + "vpaddd\t%%zmm5,%%zmm9,%%zmm5%{%%k2%}\n\t" // turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm5%{uint8%},\t16(%%r8)\n\t" // save our result. + "je\t2f\n\t" + "vpslld\t$1,\t%%zmm8,\t%%zmm8\n\t" // select which bit we want to test for. + "add\t$32,\t%%r12\n\t" + "add\t$32,\t%%r8\n\t" + "jmp\t1b\n\t" + "2:" + : [DST] "+r" (dst) + : [SRC4] "r" (q4), + [SRC1] "r" (q1), + [MASK] "m" (lowmask), + [M] "m" (m), + [ALL] "m" (allmask), + [BIT5] "m" (bit5) + : "zmm0", "zmm1", "zmm2", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "cc", "ecx", "k1", "k2", "r12", "r8", "memory" + ); } // A function for getting the dot product of two vectors, one of 5 bit resolution, and one of 8. // Used during inference, if your model prints "llama_model_loader: - type q5_K: XXX tensors", and XXX is not zero. :) void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { - /* interpret X and Y as vectors. */ - const block_q5_K * restrict x = vx; - const block_q8_K * restrict y = vy; + /* interpret X and Y as vectors. */ + const block_q5_K * restrict x = vx; + const block_q8_K * restrict y = vy; - /* the number of blocks we will process this in. */ - const int nb = n / QK_K; + /* the number of blocks we will process this in. */ + const int nb = n / QK_K; - static const uint32_t kmask1 = 0x3f3f3f3f; - static const uint32_t kmask2 = 0x0f0f0f0f; - static const uint32_t kmask3 = 0x03030303; + static const uint32_t kmask1 = 0x3f3f3f3f; + static const uint32_t kmask2 = 0x0f0f0f0f; + static const uint32_t kmask3 = 0x03030303; - uint32_t utmp[4]; + uint32_t utmp[4]; - const uint8_t * scales = (const uint8_t*)&utmp[0]; - const uint8_t * mins = (const uint8_t*)&utmp[2]; + const uint8_t * scales = (const uint8_t*)&utmp[0]; + const uint8_t * mins = (const uint8_t*)&utmp[2]; - float32x16_t sums; + float32x16_t sums; - // clear sums. - GGML_F32x16_VEC_ZERO(&sums); + // clear sums. + GGML_F32x16_VEC_ZERO(&sums); - float sumf = 0; - for (int i = 0; i < nb; ++i) { - int8x16_t q8copy [QK_K]; - int32x16_t aux32; - uint8x16_t q4copyvec [QK_K/32]; - uint8x16_t aux8 [QK_K/16]; + float sumf = 0; + for (int i = 0; i < nb; ++i) { + int8x16_t q8copy [QK_K]; + int32x16_t aux32; + uint8x16_t q4copyvec [QK_K/32]; + uint8x16_t aux8 [QK_K/16]; - // Fill in our 8 bit vector from y[]. required, because there is no good way to align members of y[], And I haven't mastered unaligned assembly yet... - memcpy (q8copy, y[i].qs, QK_K); + // Fill in our 8 bit vector from y[]. required, because there is no good way to align members of y[], And I haven't mastered unaligned assembly yet... + memcpy (q8copy, y[i].qs, QK_K); - // Fill in our 4 bit vector from x[]. required, because there is no good way to align members of x[], And I haven't mastered unaligned assembly yet... - memcpy (q4copyvec, x[i].qs, QK_K/2); + // Fill in our 4 bit vector from x[]. required, because there is no good way to align members of x[], And I haven't mastered unaligned assembly yet... + memcpy (q4copyvec, x[i].qs, QK_K/2); - // combine our 4 and 1 bit vector sets into an 8 bit value. - GGML_5bit_Unpack(q4copyvec, x[i].qh, aux8); + // combine our 4 and 1 bit vector sets into an 8 bit value. 
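        // (Scalar picture of what GGML_5bit_Unpack produces, following the block_q5_K
        //  layout: each output byte is a 4-bit nibble from qs with one bit from the qh
        //  bit-plane promoted to bit 4, giving values 0..31 -- roughly:
        //
        //      for (int j = 0, m = 1; j < QK_K/64; ++j, m <<= 1) {
        //          for (int l = 0; l < 32; ++l) out[j*64 + l]      = (qs[j*32 + l] & 0xF) | ((qh[l] & m) ? 16 : 0);
        //          m <<= 1;
        //          for (int l = 0; l < 32; ++l) out[j*64 + 32 + l] = (qs[j*32 + l] >> 4)  | ((qh[l] & m) ? 16 : 0);
        //      }
        //
        //  qs, qh and out stand in for q4copyvec, x[i].qh and aux8 in the call below.)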
+ GGML_5bit_Unpack(q4copyvec, x[i].qh, aux8); - // extract scales and mins.. - memcpy(utmp, x[i].scales, 12); - utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); - const uint32_t uaux = utmp[1] & kmask1; - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[2] = uaux; - utmp[0] &= kmask1; + // extract scales and mins.. + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; - // FIXME: while comparing FMA output to the original output, the original had an error. hunt it down. - GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16(q8copy, aux8, scales, &aux32); + // FIXME: while comparing FMA output to the original output, the original had an error. hunt it down. + GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16(q8copy, aux8, scales, &aux32); - int sumi = 0; - for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - for (int l = 0; l < GGML_F32_EPR; ++l) ((float *)&sums)[l] += d * ((int32_t *)&aux32)[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; - sumf -= dmin * sumi; - } + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < GGML_F32_EPR; ++l) ((float *)&sums)[l] += d * ((int32_t *)&aux32)[l]; + const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; + } - for (int l = 0; l < GGML_F32_EPR; ++l) sumf += ((float *)&sums)[l]; - *s = sumf; + for (int l = 0; l < GGML_F32_EPR; ++l) sumf += ((float *)&sums)[l]; + *s = sumf; } diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index 003c70b56..c80bb13da 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -1,3 +1,6 @@ +/* Xeon PHI IMCI support. */ +/* formatted by using emacs, with (M-x set-variable RET c-basic-offset RET 4 RET) executed. */ + #include // For size_t @@ -17,124 +20,123 @@ void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restri inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target) { - uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; + uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; + + __asm__ __volatile__ ( + "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. + "vmovnraps\t\t%%zmm8,\t%[RES]\n\t" + : [RES] "+m" (*target) + : [Z] "m" (zero) + : "zmm8"); - __asm__ __volatile__ ( - "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. - "vmovnraps\t\t%%zmm8,\t%[RES]\n\t" - : [RES] "+m" (*target) - : [Z] "m" (zero) - : "zmm8"); } // Multiply each item in mvec1 with the corresponding item in mvec2, adding the result to the corresponding item in sum. optionally clear the sum before starting. inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x16_t *mvec2, float32x16_t *sumvec, size_t iterations, int clear) { - uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; + uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; - __asm__ __volatile__ ( - "mov\t%[ITER],%%r8\n\t" // how many register sized chunks are we responsible for - "mov\t%[VEC1],%%r10\n\t" // where do we start work in mvec1? - "mov\t%[VEC2],%%r12\n\t" // where do we start work in mvec2? - "cmp\t$1,%[CLR]\n\t" // should we clear the sum before we start? 
- "jne\t4f\n\t" - "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm0\n\t" // if so, use an upscaling operator to do it. - "vprefetchnta\t(%%r10)\n\t" - "vprefetchnta\t(%%r12)\n\t" - "vprefetch1\t128(%%r10)\n\t" - "vprefetch1\t128(%%r12)\n\t" - "vprefetch1\t256(%%r10)\n\t" - "vprefetch1\t256(%%r12)\n\t" - "vprefetch1\t384(%%r10)\n\t" - "vprefetch1\t384(%%r12)\n\t" - "vprefetch1\t512(%%r10)\n\t" - "vprefetch1\t512(%%r12)\n\t" - "jmp\t1f\n\t" - "4:\n\t" - "vprefetch0\t(%[RES])\n\t" - "vmovaps\t\t(%[RES]),\t%%zmm0\n\t" // otherwise, load our inital state from sum.. - "vprefetchnta\t(%%r10)\n\t" - "vprefetchnta\t(%%r12)\n\t" - "1:\n\t" - "cmp\t$3,\t%%r8\n\t" // Compare iterations to three. - "jnae\t6f\n\t" // If there are not three iterations left, jump to label 6. - "vmovaps\t\t(%%r10),\t%%zmm1\n\t" // Load two vectors. - "vmovaps\t\t(%%r12),\t%%zmm2\n\t" - "sub\t$3,\t%%r8\n\t" // Decrement iterations - "vprefetchnta\t192(%%r10)\n\t" // prefetch the next float32x16_t block (192 bytes ahead) - "vprefetchnta\t192(%%r12)\n\t" - "vmovaps\t\t64(%%r10),\t%%zmm3\n\t" // Load two vectors. - "vmovaps\t\t64(%%r12),\t%%zmm4\n\t" - "vprefetch1\t320(%%r10)\n\t" // prefetch the block after the block after the next float32x16_t block (320 bytes ahead) - "vprefetch1\t320(%%r12)\n\t" - "vmovaps\t\t128(%%r10),\t%%zmm5\n\t" // Load two vectors. - "vmovaps\t\t128(%%r12),\t%%zmm6\n\t" - "vprefetch1\t576(%%r10)\n\t" - "vprefetch1\t576(%%r12)\n\t" - "vprefetch1\t704(%%r10)\n\t" - "vprefetch1\t704(%%r12)\n\t" - "add\t$192,\t%%r10\n\t" // Move to the next float32x16_t block (192 bytes ahead) - "add\t$192,\t%%r12\n\t" - "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add - "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add - "vfmadd231ps\t%%zmm5,\t%%zmm6,\t%%zmm0\n\t" // Perform a fused multiply add - "jmp\t1b\n\t" // Jump back to the start of the loop - "6:\n\t" // we know we are near the tail. handle 2, 1, and 0 cases. - "cmp\t$0,\t%%r8\n\t" // Compare iterations to zero - "je\t2f\n\t" // Jump to label 2 if zero (end of loop) - "cmp\t$1,\t%%r8\n\t" // Compare iterations to one - "vmovaps\t\t(%%r10),\t%%zmm1\n\t" // Load two vectors. - "vmovaps\t\t(%%r12),\t%%zmm2\n\t" - "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add - "je\t2f\n\t" // Jump to label 3 if one (end of loop) - // No compare. we must be two. - "vmovaps\t\t64(%%r10),\t%%zmm3\n\t" // Load two vectors. - "vmovaps\t\t64(%%r12),\t%%zmm4\n\t" - "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add - "2:\n\t" // Label for loop end - "vmovnraps\t\t%%zmm0,\t(%[RES])\n\t" // save our results. - : [RES] "+r" (sumvec) - : [ITER] "r" (iterations), - [VEC1] "r" (mvec1), - [VEC2] "r" (mvec2), - [CLR] "r" (clear), - [Z] "m" (zero) - : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "cc", "memory", "r8", "r10", "r12"); + __asm__ __volatile__ ( + "mov\t%[ITER],%%r8\n\t" // how many register sized chunks are we responsible for + "mov\t%[VEC1],%%r10\n\t" // where do we start work in mvec1? + "mov\t%[VEC2],%%r12\n\t" // where do we start work in mvec2? + "cmp\t$1,%[CLR]\n\t" // should we clear the sum before we start? + "jne\t4f\n\t" + "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm0\n\t" // if so, use an upscaling operator to do it. 
+ "vprefetchnta\t(%%r10)\n\t" + "vprefetchnta\t(%%r12)\n\t" + "vprefetch1\t128(%%r10)\n\t" + "vprefetch1\t128(%%r12)\n\t" + "vprefetch1\t256(%%r10)\n\t" + "vprefetch1\t256(%%r12)\n\t" + "vprefetch1\t384(%%r10)\n\t" + "vprefetch1\t384(%%r12)\n\t" + "vprefetch1\t512(%%r10)\n\t" + "vprefetch1\t512(%%r12)\n\t" + "jmp\t1f\n\t" + "4:\n\t" + "vprefetch0\t(%[RES])\n\t" + "vmovaps\t\t(%[RES]),\t%%zmm0\n\t" // otherwise, load our inital state from sum.. + "vprefetchnta\t(%%r10)\n\t" + "vprefetchnta\t(%%r12)\n\t" + "1:\n\t" + "cmp\t$3,\t%%r8\n\t" // Compare iterations to three. + "jnae\t6f\n\t" // If there are not three iterations left, jump to label 6. + "vmovaps\t\t(%%r10),\t%%zmm1\n\t" // Load two vectors. + "vmovaps\t\t(%%r12),\t%%zmm2\n\t" + "sub\t$3,\t%%r8\n\t" // Decrement iterations + "vprefetchnta\t192(%%r10)\n\t" // prefetch the next float32x16_t block (192 bytes ahead) + "vprefetchnta\t192(%%r12)\n\t" + "vmovaps\t\t64(%%r10),\t%%zmm3\n\t" // Load two vectors. + "vmovaps\t\t64(%%r12),\t%%zmm4\n\t" + "vprefetch1\t320(%%r10)\n\t" // prefetch the block after the block after the next float32x16_t block (320 bytes ahead) + "vprefetch1\t320(%%r12)\n\t" + "vmovaps\t\t128(%%r10),\t%%zmm5\n\t" // Load two vectors. + "vmovaps\t\t128(%%r12),\t%%zmm6\n\t" + "vprefetch1\t576(%%r10)\n\t" + "vprefetch1\t576(%%r12)\n\t" + "vprefetch1\t704(%%r10)\n\t" + "vprefetch1\t704(%%r12)\n\t" + "add\t$192,\t%%r10\n\t" // Move to the next float32x16_t block (192 bytes ahead) + "add\t$192,\t%%r12\n\t" + "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add + "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add + "vfmadd231ps\t%%zmm5,\t%%zmm6,\t%%zmm0\n\t" // Perform a fused multiply add + "jmp\t1b\n\t" // Jump back to the start of the loop + "6:\n\t" // we know we are near the tail. handle 2, 1, and 0 cases. + "cmp\t$0,\t%%r8\n\t" // Compare iterations to zero + "je\t2f\n\t" // Jump to label 2 if zero (end of loop) + "cmp\t$1,\t%%r8\n\t" // Compare iterations to one + "vmovaps\t\t(%%r10),\t%%zmm1\n\t" // Load two vectors. + "vmovaps\t\t(%%r12),\t%%zmm2\n\t" + "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add + "je\t2f\n\t" // Jump to label 3 if one (end of loop) + // No compare. we must be two. + "vmovaps\t\t64(%%r10),\t%%zmm3\n\t" // Load two vectors. + "vmovaps\t\t64(%%r12),\t%%zmm4\n\t" + "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add + "2:\n\t" // Label for loop end + "vmovnraps\t\t%%zmm0,\t(%[RES])\n\t" // save our results. + : [RES] "+r" (sumvec) + : [ITER] "r" (iterations), + [VEC1] "r" (mvec1), + [VEC2] "r" (mvec2), + [CLR] "r" (clear), + [Z] "m" (zero) + : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "cc", "memory", "r8", "r10", "r12"); } // NOTE: x and y inputs must be __attribute__((aligned(64))); void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) { - // our sum. - float32x16_t sum; + // our sum. + float32x16_t sum; - // the number of vector-sized steps we will need to do. - const uint32_t np = (n & ~(GGML_F32_EPR - 1)); + // the number of vector-sized steps we will need to do. + const uint32_t np = (n & ~(GGML_F32_EPR - 1)); - GGML_F32x16_VEC_FMA((const float32x16_t *)x, (const float32x16_t *)y, &sum, np/GGML_F32_EPR, 1); + GGML_F32x16_VEC_FMA((const float32x16_t *)x, (const float32x16_t *)y, &sum, np/GGML_F32_EPR, 1); - // FIXME: replace this with a final round using masked vectors. 
- if ( n - np != 0 ) - { - // add the leftovers, that could not be handled by the vector loop. - // our extended last part of x. - float32x16_t v1; - GGML_F32x16_VEC_ZERO(&v1); - // our extended last part of y. - float32x16_t v2; - GGML_F32x16_VEC_ZERO(&v2); + // add the leftovers, that could not be handled by the vector loop. + if ( n - np != 0 ) + { + // our extended last part of x. + float32x16_t v1; + GGML_F32x16_VEC_ZERO(&v1); + // our extended last part of y. + float32x16_t v2; + GGML_F32x16_VEC_ZERO(&v2); - memcpy(&v1, &x[np], (n - np)*sizeof(float)); - memcpy(&v2, &y[np], (n - np)*sizeof(float)); + memcpy(&v1, &x[np], (n - np)*sizeof(float)); + memcpy(&v2, &y[np], (n - np)*sizeof(float)); - GGML_F32x16_VEC_FMA(&v1, - &v2, - &sum, 1, 0); + GGML_F32x16_VEC_FMA(&v1, + &v2, + &sum, 1, 0); + } - } - - // reduce sum, and store it in s. - for (uint32_t i=0; i Date: Wed, 3 Apr 2024 23:42:34 +0000 Subject: [PATCH 064/105] replace tabs with spaces. --- ggml-phi-knc-dot_q5_K_q8_K.c | 250 +++++++++++++++++------------------ ggml-phi-knc.c | 172 ++++++++++++------------ 2 files changed, 211 insertions(+), 211 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index f7028991d..3b170d2dc 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -35,11 +35,11 @@ inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target) uint8_t zero=0; __asm__ __volatile__ ( - "vbroadcastss\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our register. - "vmovaps\t\t%%zmm8,\t%[RES]\n\t" - : [RES] "+m" (*target) - : [Z] "m" (zero) - : "zmm8", "memory"); + "vbroadcastss\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our register. + "vmovaps\t\t%%zmm8,\t%[RES]\n\t" + : [RES] "+m" (*target) + : [Z] "m" (zero) + : "zmm8", "memory"); } @@ -50,52 +50,52 @@ inline static void GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16 (int8x16_t uint8_t zero = 0; __asm__ __volatile__ ( - "vprefetche0\t(%[SRC11])\n\t" - "vprefetche0\t(%[SRC21])\n\t" - "vprefetche0\t(%[SCALE])\n\t" - "mov\t$0,\t%%ecx\n\t" - "mov\t%[SRC11],\t%%r12\n\t" - "mov\t%[SRC21],\t%%r8\n\t" - "mov\t%[SCALE],\t%%r9\n\t" - "vpbroadcastd\t%[Z]%{uint8%},\t%%zmm7\n\t" // empty our result. - "1:\n\t" - "inc\t%%ecx\n\t" // we are in our loop, increment our counter. - "cmp\t$4,\t%%ecx\n\t" // see if this is our last run-through. - "vmovdqa32\t\t(%%r12)%{sint8%},\t%%zmm0\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. - "vmovdqa32\t\t(%%r8)%{uint8%},\t%%zmm1\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. - "vpmulld\t%%zmm0,\t%%zmm1,\t%%zmm2\n\t" // perform our 64 bit multiply, low side. - "vpbroadcastd\t(%%r9)%{uint8%},\t%%zmm6\n\t" // load the item we will be multiplying by. - "vpmadd231d\t%%zmm2,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. - "vmovdqa32\t\t16(%%r12)%{sint8%},\t%%zmm3\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. - "vmovdqa32\t\t16(%%r8)%{uint8%},\t%%zmm4\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. - "vpmulld\t%%zmm3,\t%%zmm4,\t%%zmm5\n\t" // perform our 64 bit multiply, low side. - "vpmadd231d\t%%zmm5,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. - "vmovdqa32\t\t32(%%r12)%{sint8%},\t%%zmm8\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. - "vmovdqa32\t\t32(%%r8)%{uint8%},\t%%zmm1\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. 
- "vpmulld\t%%zmm8,\t%%zmm1,\t%%zmm2\n\t" // perform our 64 bit multiply, low side. - "vpbroadcastd\t1(%%r9)%{uint8%},\t%%zmm6\n\t" // load the item we will be multiplying by. - "vpmadd231d\t%%zmm2,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. - "vmovdqa32\t\t48(%%r12)%{sint8%},\t%%zmm3\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. - "vmovdqa32\t\t48(%%r8)%{uint8%},\t%%zmm4\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. - "vpmulld\t%%zmm3,\t%%zmm4,\t%%zmm5\n\t" // perform our 64 bit multiply, low side. - "vpmadd231d\t%%zmm5,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. - "je\t2f\n\t" // if this is the last time through our loop, jump to 2. - "vprefetche0\t64(%%r12)\n\t" // otherwise, prepare for another run-through. - "vprefetche0\t64(%%r8)\n\t" - "vprefetche2\t128(%%r12)\n\t" - "vprefetche2\t128(%%r8)\n\t" - "add\t$64,\t%%r12\n\t" - "add\t$64,\t%%r8\n\t" - "add\t$2,\t%%r9\n\t" - "jmp\t1b\n\t" - "2:\n\t" - "vmovdqa32\t\t%%zmm7,\t(%[RES])\n\t" // save the result. - : [RES] "+r" (res) - : [SRC11] "r" (src11), - [SRC21] "r" (src21), - [SCALE] "r" (scale), - [Z] "m" (zero) - : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "cc", "ecx", "r8", "r9", "r12", "memory"); + "vprefetche0\t(%[SRC11])\n\t" + "vprefetche0\t(%[SRC21])\n\t" + "vprefetche0\t(%[SCALE])\n\t" + "mov\t$0,\t%%ecx\n\t" + "mov\t%[SRC11],\t%%r12\n\t" + "mov\t%[SRC21],\t%%r8\n\t" + "mov\t%[SCALE],\t%%r9\n\t" + "vpbroadcastd\t%[Z]%{uint8%},\t%%zmm7\n\t" // empty our result. + "1:\n\t" + "inc\t%%ecx\n\t" // we are in our loop, increment our counter. + "cmp\t$4,\t%%ecx\n\t" // see if this is our last run-through. + "vmovdqa32\t\t(%%r12)%{sint8%},\t%%zmm0\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vmovdqa32\t\t(%%r8)%{uint8%},\t%%zmm1\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. + "vpmulld\t%%zmm0,\t%%zmm1,\t%%zmm2\n\t" // perform our 64 bit multiply, low side. + "vpbroadcastd\t(%%r9)%{uint8%},\t%%zmm6\n\t" // load the item we will be multiplying by. + "vpmadd231d\t%%zmm2,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. + "vmovdqa32\t\t16(%%r12)%{sint8%},\t%%zmm3\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vmovdqa32\t\t16(%%r8)%{uint8%},\t%%zmm4\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. + "vpmulld\t%%zmm3,\t%%zmm4,\t%%zmm5\n\t" // perform our 64 bit multiply, low side. + "vpmadd231d\t%%zmm5,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. + "vmovdqa32\t\t32(%%r12)%{sint8%},\t%%zmm8\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vmovdqa32\t\t32(%%r8)%{uint8%},\t%%zmm1\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. + "vpmulld\t%%zmm8,\t%%zmm1,\t%%zmm2\n\t" // perform our 64 bit multiply, low side. + "vpbroadcastd\t1(%%r9)%{uint8%},\t%%zmm6\n\t" // load the item we will be multiplying by. + "vpmadd231d\t%%zmm2,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. + "vmovdqa32\t\t48(%%r12)%{sint8%},\t%%zmm3\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vmovdqa32\t\t48(%%r8)%{uint8%},\t%%zmm4\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. + "vpmulld\t%%zmm3,\t%%zmm4,\t%%zmm5\n\t" // perform our 64 bit multiply, low side. + "vpmadd231d\t%%zmm5,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. 
+ "je\t2f\n\t" // if this is the last time through our loop, jump to 2. + "vprefetche0\t64(%%r12)\n\t" // otherwise, prepare for another run-through. + "vprefetche0\t64(%%r8)\n\t" + "vprefetche2\t128(%%r12)\n\t" + "vprefetche2\t128(%%r8)\n\t" + "add\t$64,\t%%r12\n\t" + "add\t$64,\t%%r8\n\t" + "add\t$2,\t%%r9\n\t" + "jmp\t1b\n\t" + "2:\n\t" + "vmovdqa32\t\t%%zmm7,\t(%[RES])\n\t" // save the result. + : [RES] "+r" (res) + : [SRC11] "r" (src11), + [SRC21] "r" (src21), + [SCALE] "r" (scale), + [Z] "m" (zero) + : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "cc", "ecx", "r8", "r9", "r12", "memory"); } // Unpack 256 unsigned 5 bit values into an 8 bit vector. @@ -107,55 +107,55 @@ inline static void GGML_5bit_Unpack (const uint8x16_t * q4, const uint8_t * q1, uint8_t bit5 = 0x10; __asm__ __volatile__ ( - "vprefetche0\t(%[SRC1])\n\t" // Issue our memory requests first thing. - "vprefetche0\t(%[SRC4])\n\t" - "vprefetche1\t64(%[SRC4])\n\t" - "mov\t%[SRC4],\t%%r12\n\t" // load the address of the head of our 4-bit list. - "mov\t%[DST],\t%%r8\n\t" // load the address of the head of our destination list. - "mov\t$0,%%ecx\n\t" // initialize our counter. - "vmovdqa32\t(%[SRC1])%{uint8%},\t%%zmm6\n\t" // move 16 packed sets of single bits into the lower 8 bits of zmm6. - "vmovdqa32\t16(%[SRC1])%{uint8%},\t%%zmm7\n\t" // move the next 16 packed sets of single bits into the lower 8 bits of zmm7. - "vpbroadcastd\t%[MASK]%{uint8%},\t%%zmm2\n\t " // load our mask. - "vpbroadcastd\t%[BIT5]%{uint8},\t%%zmm9\n\t" // load the bit we want to add (conditionally). - "vpbroadcastd\t%[M]%{uint8%},\t%%zmm8\n\t" // select which bit we want to test for. - "1:\n\t" - "inc\t%%ecx\n\t" // we are in the loop. increment the counter. - "vptestmd\t%%zmm6,\t%%zmm8,\t%%k1\n\t" // perform our test. - "vptestmd\t%%zmm7,\t%%zmm8,\t%%k2\n\t" // perform our test. - "vmovdqa32\t\t(%%r12)%{uint8%},\t%%zmm0\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vpandd\t%%zmm0,\t%%zmm2,\t%%zmm4\n\t" // apply a mask, storing the low four bits of vector zmm0 into zmm4. - "vpaddd\t%%zmm4,%%zmm9,%%zmm4%{%%k1%}\n\t" // turn on bit 5 for all values that passed the prior test. - "vmovdqa32\t\t%%zmm4%{uint8%},\t(%%r8)\n\t" // save our result. - "vmovdqa32\t\t16(%%r12)%{uint8%},\t%%zmm1\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vpandd\t%%zmm1,\t%%zmm2,\t%%zmm5\n\t" // apply a mask, storing the next low four bits of vector zmm1 into zmm5. - "vpaddd\t%%zmm5,%%zmm9,%%zmm5%{%%k2%}\n\t" // turn on bit 5 for all values that passed the prior test. - "vmovdqa32\t\t%%zmm5%{uint8%},\t16(%%r8)\n\t" // save our result. - "add\t$32,\t%%r8\n\t" - "cmp\t$4,\t%%ecx\n\t" - "vpslld\t$1,\t%%zmm8,\t%%zmm8\n\t" // select which bit we want to test for. - "vptestmd\t%%zmm6,\t%%zmm8,\t%%k1\n\t" // perform our test. - "vptestmd\t%%zmm7,\t%%zmm8,\t%%k2\n\t" // perform our test. - "vpsrld\t$4,\t%%zmm0,\t%%zmm4\n\t" // load our even 4 bit sequence into zmm4. - "vpaddd\t%%zmm4,%%zmm9,%%zmm4%{%%k1%}\n\t" // turn on bit 5 for all values that passed the prior test. - "vmovdqa32\t\t%%zmm4%{uint8%},\t(%%r8)\n\t" // save our result. - "vpsrld\t$4,\t%%zmm1,\t%%zmm5\n\t" // load our even 4 bit sequence into zmm5. - "vpaddd\t%%zmm5,%%zmm9,%%zmm5%{%%k2%}\n\t" // turn on bit 5 for all values that passed the prior test. - "vmovdqa32\t\t%%zmm5%{uint8%},\t16(%%r8)\n\t" // save our result. 
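        /* Sketch, not from the patch: the rule this loop applies to every output byte is
           to take one 4-bit nibble from q4 and set bit 4 when the matching packed bit in
           q1 is on, roughly dst[k] = (nibble_k & 0x0F) | (bit_k ? 0x10 : 0). Each pass
           emits 32 low-nibble results, then 32 high-nibble results, shifting the q1
           bit-select mask left by one between halves. */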
- "je\t2f\n\t" - "vpslld\t$1,\t%%zmm8,\t%%zmm8\n\t" // select which bit we want to test for. - "add\t$32,\t%%r12\n\t" - "add\t$32,\t%%r8\n\t" - "jmp\t1b\n\t" - "2:" - : [DST] "+r" (dst) - : [SRC4] "r" (q4), - [SRC1] "r" (q1), - [MASK] "m" (lowmask), - [M] "m" (m), - [ALL] "m" (allmask), - [BIT5] "m" (bit5) - : "zmm0", "zmm1", "zmm2", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "cc", "ecx", "k1", "k2", "r12", "r8", "memory" - ); + "vprefetche0\t(%[SRC1])\n\t" // Issue our memory requests first thing. + "vprefetche0\t(%[SRC4])\n\t" + "vprefetche1\t64(%[SRC4])\n\t" + "mov\t%[SRC4],\t%%r12\n\t" // load the address of the head of our 4-bit list. + "mov\t%[DST],\t%%r8\n\t" // load the address of the head of our destination list. + "mov\t$0,%%ecx\n\t" // initialize our counter. + "vmovdqa32\t(%[SRC1])%{uint8%},\t%%zmm6\n\t" // move 16 packed sets of single bits into the lower 8 bits of zmm6. + "vmovdqa32\t16(%[SRC1])%{uint8%},\t%%zmm7\n\t" // move the next 16 packed sets of single bits into the lower 8 bits of zmm7. + "vpbroadcastd\t%[MASK]%{uint8%},\t%%zmm2\n\t " // load our mask. + "vpbroadcastd\t%[BIT5]%{uint8},\t%%zmm9\n\t" // load the bit we want to add (conditionally). + "vpbroadcastd\t%[M]%{uint8%},\t%%zmm8\n\t" // select which bit we want to test for. + "1:\n\t" + "inc\t%%ecx\n\t" // we are in the loop. increment the counter. + "vptestmd\t%%zmm6,\t%%zmm8,\t%%k1\n\t" // perform our test. + "vptestmd\t%%zmm7,\t%%zmm8,\t%%k2\n\t" // perform our test. + "vmovdqa32\t\t(%%r12)%{uint8%},\t%%zmm0\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vpandd\t%%zmm0,\t%%zmm2,\t%%zmm4\n\t" // apply a mask, storing the low four bits of vector zmm0 into zmm4. + "vpaddd\t%%zmm4,%%zmm9,%%zmm4%{%%k1%}\n\t" // turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm4%{uint8%},\t(%%r8)\n\t" // save our result. + "vmovdqa32\t\t16(%%r12)%{uint8%},\t%%zmm1\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vpandd\t%%zmm1,\t%%zmm2,\t%%zmm5\n\t" // apply a mask, storing the next low four bits of vector zmm1 into zmm5. + "vpaddd\t%%zmm5,%%zmm9,%%zmm5%{%%k2%}\n\t" // turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm5%{uint8%},\t16(%%r8)\n\t" // save our result. + "add\t$32,\t%%r8\n\t" + "cmp\t$4,\t%%ecx\n\t" + "vpslld\t$1,\t%%zmm8,\t%%zmm8\n\t" // select which bit we want to test for. + "vptestmd\t%%zmm6,\t%%zmm8,\t%%k1\n\t" // perform our test. + "vptestmd\t%%zmm7,\t%%zmm8,\t%%k2\n\t" // perform our test. + "vpsrld\t$4,\t%%zmm0,\t%%zmm4\n\t" // load our even 4 bit sequence into zmm4. + "vpaddd\t%%zmm4,%%zmm9,%%zmm4%{%%k1%}\n\t" // turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm4%{uint8%},\t(%%r8)\n\t" // save our result. + "vpsrld\t$4,\t%%zmm1,\t%%zmm5\n\t" // load our even 4 bit sequence into zmm5. + "vpaddd\t%%zmm5,%%zmm9,%%zmm5%{%%k2%}\n\t" // turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm5%{uint8%},\t16(%%r8)\n\t" // save our result. + "je\t2f\n\t" + "vpslld\t$1,\t%%zmm8,\t%%zmm8\n\t" // select which bit we want to test for. 
+ "add\t$32,\t%%r12\n\t" + "add\t$32,\t%%r8\n\t" + "jmp\t1b\n\t" + "2:" + : [DST] "+r" (dst) + : [SRC4] "r" (q4), + [SRC1] "r" (q1), + [MASK] "m" (lowmask), + [M] "m" (m), + [ALL] "m" (allmask), + [BIT5] "m" (bit5) + : "zmm0", "zmm1", "zmm2", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "cc", "ecx", "k1", "k2", "r12", "r8", "memory" + ); } // A function for getting the dot product of two vectors, one of 5 bit resolution, and one of 8. @@ -185,37 +185,37 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r float sumf = 0; for (int i = 0; i < nb; ++i) { - int8x16_t q8copy [QK_K]; - int32x16_t aux32; - uint8x16_t q4copyvec [QK_K/32]; - uint8x16_t aux8 [QK_K/16]; + int8x16_t q8copy [QK_K]; + int32x16_t aux32; + uint8x16_t q4copyvec [QK_K/32]; + uint8x16_t aux8 [QK_K/16]; - // Fill in our 8 bit vector from y[]. required, because there is no good way to align members of y[], And I haven't mastered unaligned assembly yet... - memcpy (q8copy, y[i].qs, QK_K); + // Fill in our 8 bit vector from y[]. required, because there is no good way to align members of y[], And I haven't mastered unaligned assembly yet... + memcpy (q8copy, y[i].qs, QK_K); - // Fill in our 4 bit vector from x[]. required, because there is no good way to align members of x[], And I haven't mastered unaligned assembly yet... - memcpy (q4copyvec, x[i].qs, QK_K/2); + // Fill in our 4 bit vector from x[]. required, because there is no good way to align members of x[], And I haven't mastered unaligned assembly yet... + memcpy (q4copyvec, x[i].qs, QK_K/2); - // combine our 4 and 1 bit vector sets into an 8 bit value. - GGML_5bit_Unpack(q4copyvec, x[i].qh, aux8); + // combine our 4 and 1 bit vector sets into an 8 bit value. + GGML_5bit_Unpack(q4copyvec, x[i].qh, aux8); - // extract scales and mins.. - memcpy(utmp, x[i].scales, 12); - utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); - const uint32_t uaux = utmp[1] & kmask1; - utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); - utmp[2] = uaux; - utmp[0] &= kmask1; + // extract scales and mins.. + memcpy(utmp, x[i].scales, 12); + utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); + const uint32_t uaux = utmp[1] & kmask1; + utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); + utmp[2] = uaux; + utmp[0] &= kmask1; - // FIXME: while comparing FMA output to the original output, the original had an error. hunt it down. - GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16(q8copy, aux8, scales, &aux32); + // FIXME: while comparing FMA output to the original output, the original had an error. hunt it down. 
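A plain-C sketch of the accumulation the call below is expected to reproduce may help with the FIXME above; it is inferred from the surrounding code (q8copy, aux8 and scales as used here), not taken from the patch itself.

        /* Hedged scalar reference for one 256-value superblock: eight sub-blocks of
           32 values, each weighted by one scale byte; lanes mirror the 16-wide
           integer accumulator. */
        static void fma_i32x16_ref(const int8_t q8[256], const uint8_t q5[256],
                                   const uint8_t sc[8], int32_t acc[16]) {
            for (int g = 0; g < 8; ++g)
                for (int l = 0; l < 32; ++l) {
                    int idx = 32*g + l;
                    acc[idx % 16] += (int32_t) sc[g] * q8[idx] * q5[idx];
                }
        }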
+ GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16(q8copy, aux8, scales, &aux32); - int sumi = 0; - for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - for (int l = 0; l < GGML_F32_EPR; ++l) ((float *)&sums)[l] += d * ((int32_t *)&aux32)[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; - sumf -= dmin * sumi; + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < GGML_F32_EPR; ++l) ((float *)&sums)[l] += d * ((int32_t *)&aux32)[l]; + const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + sumf -= dmin * sumi; } for (int l = 0; l < GGML_F32_EPR; ++l) sumf += ((float *)&sums)[l]; diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index c80bb13da..710da27a6 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -23,11 +23,11 @@ inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target) uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; __asm__ __volatile__ ( - "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. - "vmovnraps\t\t%%zmm8,\t%[RES]\n\t" - : [RES] "+m" (*target) - : [Z] "m" (zero) - : "zmm8"); + "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. + "vmovnraps\t\t%%zmm8,\t%[RES]\n\t" + : [RES] "+m" (*target) + : [Z] "m" (zero) + : "zmm8"); } @@ -37,73 +37,73 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; __asm__ __volatile__ ( - "mov\t%[ITER],%%r8\n\t" // how many register sized chunks are we responsible for - "mov\t%[VEC1],%%r10\n\t" // where do we start work in mvec1? - "mov\t%[VEC2],%%r12\n\t" // where do we start work in mvec2? - "cmp\t$1,%[CLR]\n\t" // should we clear the sum before we start? - "jne\t4f\n\t" - "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm0\n\t" // if so, use an upscaling operator to do it. - "vprefetchnta\t(%%r10)\n\t" - "vprefetchnta\t(%%r12)\n\t" - "vprefetch1\t128(%%r10)\n\t" - "vprefetch1\t128(%%r12)\n\t" - "vprefetch1\t256(%%r10)\n\t" - "vprefetch1\t256(%%r12)\n\t" - "vprefetch1\t384(%%r10)\n\t" - "vprefetch1\t384(%%r12)\n\t" - "vprefetch1\t512(%%r10)\n\t" - "vprefetch1\t512(%%r12)\n\t" - "jmp\t1f\n\t" - "4:\n\t" - "vprefetch0\t(%[RES])\n\t" - "vmovaps\t\t(%[RES]),\t%%zmm0\n\t" // otherwise, load our inital state from sum.. - "vprefetchnta\t(%%r10)\n\t" - "vprefetchnta\t(%%r12)\n\t" - "1:\n\t" - "cmp\t$3,\t%%r8\n\t" // Compare iterations to three. - "jnae\t6f\n\t" // If there are not three iterations left, jump to label 6. - "vmovaps\t\t(%%r10),\t%%zmm1\n\t" // Load two vectors. - "vmovaps\t\t(%%r12),\t%%zmm2\n\t" - "sub\t$3,\t%%r8\n\t" // Decrement iterations - "vprefetchnta\t192(%%r10)\n\t" // prefetch the next float32x16_t block (192 bytes ahead) - "vprefetchnta\t192(%%r12)\n\t" - "vmovaps\t\t64(%%r10),\t%%zmm3\n\t" // Load two vectors. - "vmovaps\t\t64(%%r12),\t%%zmm4\n\t" - "vprefetch1\t320(%%r10)\n\t" // prefetch the block after the block after the next float32x16_t block (320 bytes ahead) - "vprefetch1\t320(%%r12)\n\t" - "vmovaps\t\t128(%%r10),\t%%zmm5\n\t" // Load two vectors. 
- "vmovaps\t\t128(%%r12),\t%%zmm6\n\t" - "vprefetch1\t576(%%r10)\n\t" - "vprefetch1\t576(%%r12)\n\t" - "vprefetch1\t704(%%r10)\n\t" - "vprefetch1\t704(%%r12)\n\t" - "add\t$192,\t%%r10\n\t" // Move to the next float32x16_t block (192 bytes ahead) - "add\t$192,\t%%r12\n\t" - "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add - "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add - "vfmadd231ps\t%%zmm5,\t%%zmm6,\t%%zmm0\n\t" // Perform a fused multiply add - "jmp\t1b\n\t" // Jump back to the start of the loop - "6:\n\t" // we know we are near the tail. handle 2, 1, and 0 cases. - "cmp\t$0,\t%%r8\n\t" // Compare iterations to zero - "je\t2f\n\t" // Jump to label 2 if zero (end of loop) - "cmp\t$1,\t%%r8\n\t" // Compare iterations to one - "vmovaps\t\t(%%r10),\t%%zmm1\n\t" // Load two vectors. - "vmovaps\t\t(%%r12),\t%%zmm2\n\t" - "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add - "je\t2f\n\t" // Jump to label 3 if one (end of loop) - // No compare. we must be two. - "vmovaps\t\t64(%%r10),\t%%zmm3\n\t" // Load two vectors. - "vmovaps\t\t64(%%r12),\t%%zmm4\n\t" - "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add - "2:\n\t" // Label for loop end - "vmovnraps\t\t%%zmm0,\t(%[RES])\n\t" // save our results. - : [RES] "+r" (sumvec) - : [ITER] "r" (iterations), - [VEC1] "r" (mvec1), - [VEC2] "r" (mvec2), - [CLR] "r" (clear), - [Z] "m" (zero) - : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "cc", "memory", "r8", "r10", "r12"); + "mov\t%[ITER],%%r8\n\t" // how many register sized chunks are we responsible for + "mov\t%[VEC1],%%r10\n\t" // where do we start work in mvec1? + "mov\t%[VEC2],%%r12\n\t" // where do we start work in mvec2? + "cmp\t$1,%[CLR]\n\t" // should we clear the sum before we start? + "jne\t4f\n\t" + "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm0\n\t" // if so, use an upscaling operator to do it. + "vprefetchnta\t(%%r10)\n\t" + "vprefetchnta\t(%%r12)\n\t" + "vprefetch1\t128(%%r10)\n\t" + "vprefetch1\t128(%%r12)\n\t" + "vprefetch1\t256(%%r10)\n\t" + "vprefetch1\t256(%%r12)\n\t" + "vprefetch1\t384(%%r10)\n\t" + "vprefetch1\t384(%%r12)\n\t" + "vprefetch1\t512(%%r10)\n\t" + "vprefetch1\t512(%%r12)\n\t" + "jmp\t1f\n\t" + "4:\n\t" + "vprefetch0\t(%[RES])\n\t" + "vmovaps\t\t(%[RES]),\t%%zmm0\n\t" // otherwise, load our inital state from sum.. + "vprefetchnta\t(%%r10)\n\t" + "vprefetchnta\t(%%r12)\n\t" + "1:\n\t" + "cmp\t$3,\t%%r8\n\t" // Compare iterations to three. + "jnae\t6f\n\t" // If there are not three iterations left, jump to label 6. + "vmovaps\t\t(%%r10),\t%%zmm1\n\t" // Load two vectors. + "vmovaps\t\t(%%r12),\t%%zmm2\n\t" + "sub\t$3,\t%%r8\n\t" // Decrement iterations + "vprefetchnta\t192(%%r10)\n\t" // prefetch the next float32x16_t block (192 bytes ahead) + "vprefetchnta\t192(%%r12)\n\t" + "vmovaps\t\t64(%%r10),\t%%zmm3\n\t" // Load two vectors. + "vmovaps\t\t64(%%r12),\t%%zmm4\n\t" + "vprefetch1\t320(%%r10)\n\t" // prefetch the block after the block after the next float32x16_t block (320 bytes ahead) + "vprefetch1\t320(%%r12)\n\t" + "vmovaps\t\t128(%%r10),\t%%zmm5\n\t" // Load two vectors. 
+ "vmovaps\t\t128(%%r12),\t%%zmm6\n\t" + "vprefetch1\t576(%%r10)\n\t" + "vprefetch1\t576(%%r12)\n\t" + "vprefetch1\t704(%%r10)\n\t" + "vprefetch1\t704(%%r12)\n\t" + "add\t$192,\t%%r10\n\t" // Move to the next float32x16_t block (192 bytes ahead) + "add\t$192,\t%%r12\n\t" + "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add + "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add + "vfmadd231ps\t%%zmm5,\t%%zmm6,\t%%zmm0\n\t" // Perform a fused multiply add + "jmp\t1b\n\t" // Jump back to the start of the loop + "6:\n\t" // we know we are near the tail. handle 2, 1, and 0 cases. + "cmp\t$0,\t%%r8\n\t" // Compare iterations to zero + "je\t2f\n\t" // Jump to label 2 if zero (end of loop) + "cmp\t$1,\t%%r8\n\t" // Compare iterations to one + "vmovaps\t\t(%%r10),\t%%zmm1\n\t" // Load two vectors. + "vmovaps\t\t(%%r12),\t%%zmm2\n\t" + "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add + "je\t2f\n\t" // Jump to label 3 if one (end of loop) + // No compare. we must be two. + "vmovaps\t\t64(%%r10),\t%%zmm3\n\t" // Load two vectors. + "vmovaps\t\t64(%%r12),\t%%zmm4\n\t" + "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add + "2:\n\t" // Label for loop end + "vmovnraps\t\t%%zmm0,\t(%[RES])\n\t" // save our results. + : [RES] "+r" (sumvec) + : [ITER] "r" (iterations), + [VEC1] "r" (mvec1), + [VEC2] "r" (mvec2), + [CLR] "r" (clear), + [Z] "m" (zero) + : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "cc", "memory", "r8", "r10", "r12"); } // NOTE: x and y inputs must be __attribute__((aligned(64))); @@ -119,24 +119,24 @@ void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restri // add the leftovers, that could not be handled by the vector loop. if ( n - np != 0 ) - { - // our extended last part of x. - float32x16_t v1; - GGML_F32x16_VEC_ZERO(&v1); - // our extended last part of y. - float32x16_t v2; - GGML_F32x16_VEC_ZERO(&v2); + { + // our extended last part of x. + float32x16_t v1; + GGML_F32x16_VEC_ZERO(&v1); + // our extended last part of y. + float32x16_t v2; + GGML_F32x16_VEC_ZERO(&v2); - memcpy(&v1, &x[np], (n - np)*sizeof(float)); - memcpy(&v2, &y[np], (n - np)*sizeof(float)); + memcpy(&v1, &x[np], (n - np)*sizeof(float)); + memcpy(&v2, &y[np], (n - np)*sizeof(float)); - GGML_F32x16_VEC_FMA(&v1, - &v2, - &sum, 1, 0); - } + GGML_F32x16_VEC_FMA(&v1, + &v2, + &sum, 1, 0); + } // reduce sum, and store it in s. for (uint32_t i=0; i Date: Mon, 22 Apr 2024 18:16:28 +0000 Subject: [PATCH 065/105] further optimizations. 0.99 tokens per second. --- Makefile | 2 +- bench-phi-knc.c | 175 +++++++++++++++++-- ggml-phi-knc-dot_q5_K_q8_K.c | 315 ++++++++++++++++++++++------------- ggml-phi-knc-dot_q5_K_q8_K.h | 18 +- 4 files changed, 379 insertions(+), 131 deletions(-) diff --git a/Makefile b/Makefile index 875bec482..7ed51cd02 100644 --- a/Makefile +++ b/Makefile @@ -707,7 +707,7 @@ bench-phi-knc.s: bench-phi-knc.c ggml-phi-knc.s: ggml-phi-knc.c $(CC) $(CFLAGS) -S $< -o $(call GET_ASM_FILE, $<) -bench-phi-knc: bench-phi-knc.c ggml-phi-knc.o +bench-phi-knc: bench-phi-knc.c ggml-phi-knc.o ggml-phi-knc-dot_q5_K_q8_K.o $(CC) $(CFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CC) $(CFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/bench-phi-knc.c b/bench-phi-knc.c index a59e2e5b7..0f8efc833 100644 --- a/bench-phi-knc.c +++ b/bench-phi-knc.c @@ -1,33 +1,52 @@ +/* bench-phi-knc.c: benchmarks and tests for the Xeon PHI Knights Corner optimizations. 
*/ + #include #include #include #include -#include /*for CLOCK_REALTIME? */ + +/* For CLOCK_REALTIME? */ +#include #include +/* For memcpy */ +#include + +/* include the increasingly inacurately named header for our F32 dot product code. */ #include "ggml-phi-knc.h" -#define MAXVEC 1024768 -#define RUNTOTAL 12 -#define RUNS +/* include the header for our Q8K_Q5K dot product code. */ +#include "ggml-phi-knc-dot_q5_K_q8_K.h" + +// largest Float32 vectors to get the dot product of. +#define F32_MAXVEC 1024768 +// how many benchmarks we will run in total. +#define F32_RUNCOUNT 12 +#define F32_ITEMS_PER_RUN {10, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 1024768} + int main(void) { - struct timespec start, middle, end; - double vector_time; - double scalar_time; - float scalar = 0.0f; - float vector = 0.0f; - int vecRuns[RUNTOTAL] = {10, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 1024768}; + int vecRuns[F32_RUNCOUNT] = F32_ITEMS_PER_RUN; - for (uint32_t runCount = 0; runCount < RUNTOTAL; ++runCount) + // seed the random number generator. + srand(time(NULL)); + + // Run benchmarks for our F32 dot product functions. Benchmark them against a naieve implementation. + for (uint8_t runCount = 0; runCount < F32_RUNCOUNT; ++runCount) { + struct timespec start, middle, end; + double vector_time; + double scalar_time; + float scalar = 0.0f; + float vector = 0.0f; + // Generate random input vector of [-1, 1] values. - float vec1[MAXVEC] __attribute__((aligned(64))); + float vec1[F32_MAXVEC] __attribute__((aligned(64))); for (int i = 0; i < vecRuns[runCount]; i++) vec1[i] = 2 * (0.5 - rand() / (float)RAND_MAX); // Generate a second random input vector of [-1, 1] values. - float vec2[MAXVEC] __attribute__((aligned(64))); + float vec2[F32_MAXVEC] __attribute__((aligned(64))); for (int i = 0; i < vecRuns[runCount]; i++) vec2[i] = 2 * (0.5 - rand() / (float)RAND_MAX); @@ -60,5 +79,135 @@ int main(void) fflush(stdout); + // Generate a random input vector of 256 4 bit values. + uint8x16_t q4[8]; + uint8_t * q4ptr = (uint8_t *)q4; + for (int i = 0; i < 128; i++) + q4ptr[i] = rand() && 0xFF; + + // Generate a random input vector of 256 1 bit values. + uint8x16_t q1[2]; + uint8_t * q1ptr = (uint8_t *)q1; + for (int i = 0; i < 32; i++) + q1ptr[i] = rand() && 0xFF; + + // Get our reference, unshifted result. + uint8x16_t q5[16]; + GGML_5bit_Unpack_Unaligned(q4, (uint8_t *)q1, q5); + + printf("successfully got a Q5.\n"); + + // Perform alignment tests, for GGML_5bit_Unpack_Unaligned. + // Try to run GGML_5bit_Unpack_Unaligned with all possible misalignments, and get it to fail. + for (uint8_t shiftCount = 1; shiftCount < 16; ++shiftCount) + { + uint8x16_t q5new[16]; + uint8x16_t q4Shifted[9]; + + // create an off-by-shiftCount copy of q4. + q4ptr = ((uint8_t *)q4Shifted) + shiftCount; + memcpy (q4ptr, q4, 128); + + // call the unaligned form of this function: + GGML_5bit_Unpack_Unaligned((uint8x16_t *)q4ptr, (uint8_t *)q1, q5new); + + for (uint32_t byteCount = 0; byteCount < 256; ++byteCount) + { + if ( ((uint8_t *)q5new)[byteCount] != ((uint8_t *)q5)[byteCount] ) + { + printf("whoops!\nshiftCount: %d\nbyteCount: %d\n", shiftCount, byteCount); + exit (-1); + } + } + + printf("Got a Q5 offset by %d\n", shiftCount); + } + + // Generate a random input vector of 256 8 bit values. + int8x16_t q8[16]; + int8_t * q8ptr = (int8_t *)q8; + for (int i = 0; i < 256; i++) + q8ptr[i] = rand() && 0xFF; + + // Generate eight random scales, one for each pair of sums. 
+ uint8_t scale[8]; + for (int i = 0; i < 8; i++) + scale[i] = rand() && 0xFF; + + // Generate a random X scale. + float rndScaleX = 2 * (0.5 - rand() / (float)RAND_MAX); + ggml_fp16_t scaleX = GGML_PHI_FP32_TO_FP16(rndScaleX); + + // Display the random X scale. Verifies FP32_TO_FP16_TO_FP32 is working. + printf("rndScaleX: %f\n", rndScaleX); + printf("scaleX: %x\n", scaleX); + printf("newScaleX: %f\n", GGML_PHI_FP16_TO_FP32(scaleX)); + + // Generate a random Y scale. + float scaleY = 2 * (0.5 - rand() / (float)RAND_MAX); + printf("scaleY: %f\n", scaleY); + + // Create a place for our golden result. + float32x16_t res; + + // Clear res. + GGML_F32x16_VEC_ZERO(&res); + + // Generate an initial result, to compare to. + GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned (q8, q5, scale, scaleX, scaleY, &res); + + // Generate a sum of the result. + float sum = 0.0f; + for (int l = 0; l < 16; ++l) sum += ((float *)&res)[l]; + + printf("Got a res: %f\n", sum); + + // Perform alignment tests, for GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned. + // try to run GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned with all possible mis-alignments, and get it to fail. + for (uint8_t shiftCount = 1; shiftCount < 16; ++shiftCount) + { + float32x16_t resNew1; + int8x16_t q8Shifted[17]; + + // Create an off-by-shiftCount copy of q8. + q8ptr = ((int8_t *)q8Shifted)+shiftCount; + memcpy (q8ptr, q8, 256); + + // Clear resNew. + GGML_F32x16_VEC_ZERO(&resNew1); + + // Call the unaligned form of this function: + GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned ((int8x16_t *)q8ptr, q5, scale, scaleX, scaleY, &resNew1); + + // check the result against our reference. + for (uint32_t floatCount = 0; floatCount < 64; ++floatCount) + { + if ( ((int8_t *)&resNew1)[floatCount] != ((int8_t *)&res)[floatCount] ) + { + printf("whoops!\nshiftCount: %d\nfloatCount: %d\n", shiftCount, floatCount); + for (uint32_t row = 0; row < 16 ; ++row) + { + for (int col1 = 0; col1 < 4; ++col1) + { + printf("%2.2x\t", ((int8_t *)&resNew1)[(4*row)+col1]); + } + printf(" vs "); + for (int col2 = 0; col2 < 4; ++col2) + { + printf("%2.2x\t", ((int8_t *)&res)[(4*row)+col2]); + } + printf ("\n"); + } + exit (-1); + } + } + + // Generate a sum of our new result. + float sumf = 0.0f; + for (int l = 0; l < 16; ++l) sumf += ((float *)&resNew1)[l]; + + printf("Got a res from a Q8 offset by %d: %f\n", ((int)q8ptr) & 0x3F, sumf); + } + return 0; } diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 3b170d2dc..a4316204e 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -15,135 +15,227 @@ // For block_q5_K and block_q8_K. #include "ggml-common.h" -// This SIMD unit can work with 32 float32s at once. -#define GGML_F32_STEP 32 -// We can fit 16 of these float32s in a single vector register. +// For our vector types. +#include "ggml-phi-knc-dot_q5_K_q8_K.h" + +// We can fit 16 float32s in a single vector register. #define GGML_F32_EPR 16 -/* we force an alignment, because i haven't written unaligned forms of the assembly functions, yet.. */ -typedef float float32x16_t __attribute__((vector_size (64), aligned(64))); -typedef int8_t int8x16_t __attribute__((vector_size (16), aligned(16))); -typedef uint8_t uint8x16_t __attribute__((vector_size (16), aligned(16))); -typedef int32_t int32x16_t __attribute__((vector_size (64), aligned(64))); - -/* A forward declaration, to keep GCC happy. 
*/ -void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc); - -/* clear a vector of 16 floats. */ -inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target) +/* Clear a vector of 16 floats. */ +void GGML_F32x16_VEC_ZERO(float32x16_t *target) { uint8_t zero=0; __asm__ __volatile__ ( - "vbroadcastss\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our register. - "vmovaps\t\t%%zmm8,\t%[RES]\n\t" + "vbroadcastss\t%[Z]%{uint8%},\t%%zmm0\n\t" // use an upscaling operator to clear our register. + "vmovaps\t\t%%zmm0,\t%[RES]\n\t" : [RES] "+m" (*target) : [Z] "m" (zero) - : "zmm8", "memory"); + : "zmm0", "memory"); } -// This function perform two multiplies of an I8x16 and an I8x16 vector into two I16x16 vectors. then does an FMA on the scaled result of multiplying the two I16x16 vectors, adding the result into an I32x16. +/* convert a FP16 to a FP32. */ +float GGML_PHI_FP16_TO_FP32(ggml_fp16_t src) +{ + // we only care aboun one result. + uint32_t mask=0x0001; + + // we declare this as an array, so it ends up in a different memory section. + float f32[1] __attribute__((aligned(64))); + + __asm__ __volatile__ ( + "kmov\t%[M],\t%%k1\n\t" + "vbroadcastss\t%[SRC]%{float16%},\t%%zmm1%{%%k1%}\n\t" + "vmovaps\t\t%%zmm1,\t%[DST]%{%%k1%}\n\t" + : [DST] "+m" (f32) + : [SRC] "m" (src), + [M] "r" (mask) + : "zmm1", "memory", "k1"); + return f32[0]; +} + +/* convert a FP32 to a FP16. */ +ggml_fp16_t GGML_PHI_FP32_TO_FP16(float src) +{ + uint32_t mask=0x0001; + + // we declare this as an array, so it ends up in a different memory section. + ggml_fp16_t f16[1] __attribute__((aligned(64))); + + __asm__ __volatile__ ( + "kmov\t%[M],\t%%k1\n\t" + "vbroadcastss\t%[SRC],\t%%zmm2%{%%k1%}\n\t" + "vmovaps\t\t%%zmm2%{float16%},\t%[DST]%{%%k1%}\n\t" + : [DST] "+m" (f16) + : [SRC] "m" (src), + [M] "r" (mask) + : "zmm2", "memory", "k1"); + return f16[0]; +} + + +// This function perform two multiplies of an I8x16 and an I8x16 vector into two I16x16 vectors. then does an FMA on the scaled result of multiplying the two I16x16 vectors, adding the result into an I32x16. When done, it multiplies this I32x16 by a float, returning a F32x16. // it loops 8 times. well, actually four, with an unroll. -inline static void GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16 (int8x16_t *src11, uint8x16_t *src21, const uint8_t *scale, int32x16_t *res) +void GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned (const int8x16_t *q8, uint8x16_t *q5, const uint8_t *scale, ggml_fp16_t scaleX, float scaleY, float32x16_t *res) { uint8_t zero = 0; + uint64_t q8offset=((uint64_t) q8) & 0x3f; __asm__ __volatile__ ( - "vprefetche0\t(%[SRC11])\n\t" - "vprefetche0\t(%[SRC21])\n\t" - "vprefetche0\t(%[SCALE])\n\t" - "mov\t$0,\t%%ecx\n\t" - "mov\t%[SRC11],\t%%r12\n\t" - "mov\t%[SRC21],\t%%r8\n\t" + "vprefetchenta\t(%[RES])\n\t" + "vprefetch0\t64(%[SCALE])\n\t" + "vprefetch0\t(%[SRC8])\n\t" + "vprefetch0\t64(%[SRC8])\n\t" + "vprefetch0\t(%[SRC5])\n\t" + "mov\t%[SRC8],\t%%r11\n\t" // use r11 to store the address for vloadunpackld. + "mov\t%[SRC5],\t%%r8\n\t" "mov\t%[SCALE],\t%%r9\n\t" - "vpbroadcastd\t%[Z]%{uint8%},\t%%zmm7\n\t" // empty our result. + "mov\t$0,\t%%ecx\n\t" + "mov\t%[SRC8],\t%%r15\n\t" // use r12-r15 to store the addresses for vloadunpackhd. 
+ "mov\t%[SRC8],\t%%r14\n\t" + "mov\t%[SRC8],\t%%r13\n\t" + "mov\t%[SRC8],\t%%r12\n\t" + "mov\t%[OFFSET],\t%%r10\n\t" + "cmp\t$32,%%r10\n\t" // Examine OFFSET, and decide which (if any) of the vloadunpackhd invocations needs to be increaned by 64. + "jl\t20f\n\t" + "cmp\t$48,%%r10\n\t" + "jl\t21f\n\t" + "add\t$64,%%r12\n\t" // greater than 48. + "jmp\t18f\n\t" + "21:\n\t" + "add\t$64,%%r13\n\t" // between 48 and 32. + "jmp\t18f\n\t" + "20:\n\t" // less than 32... + "cmp\t$16,%%r10\n\t" + "jz\t18f\n\t" // zero + "jl\t23f\n\t" + "add\t$64,%%r14\n\t" // between 32 and 16... + "jmp\t18f\n\t" + "23:\n\t" + "add\t$64,%%r15\n\t" // between 16 and zero.. + "18:\n\t" + "vbroadcastss\t%[SCALEY],\t%%zmm3\n\t" // load the scale factors coresponding to the two input vectors. + "vbroadcastss\t%[SCALEX]%{float16%},\t%%zmm4\n\t" + "vmulps\t%%zmm3,\t%%zmm4,\t%%zmm5\n\t" // prepare the factor we're going to multiply the result by.. + "vmovaps\t\t(%[RES]),\t%%zmm6\n\t" // load our inital state from sum.. + "vpbroadcastd\t%[Z]%{uint8%},\t%%zmm7\n\t" // empty our result. "1:\n\t" - "inc\t%%ecx\n\t" // we are in our loop, increment our counter. - "cmp\t$4,\t%%ecx\n\t" // see if this is our last run-through. - "vmovdqa32\t\t(%%r12)%{sint8%},\t%%zmm0\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. - "vmovdqa32\t\t(%%r8)%{uint8%},\t%%zmm1\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. - "vpmulld\t%%zmm0,\t%%zmm1,\t%%zmm2\n\t" // perform our 64 bit multiply, low side. - "vpbroadcastd\t(%%r9)%{uint8%},\t%%zmm6\n\t" // load the item we will be multiplying by. - "vpmadd231d\t%%zmm2,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. - "vmovdqa32\t\t16(%%r12)%{sint8%},\t%%zmm3\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. - "vmovdqa32\t\t16(%%r8)%{uint8%},\t%%zmm4\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. - "vpmulld\t%%zmm3,\t%%zmm4,\t%%zmm5\n\t" // perform our 64 bit multiply, low side. - "vpmadd231d\t%%zmm5,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. - "vmovdqa32\t\t32(%%r12)%{sint8%},\t%%zmm8\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. - "vmovdqa32\t\t32(%%r8)%{uint8%},\t%%zmm1\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. - "vpmulld\t%%zmm8,\t%%zmm1,\t%%zmm2\n\t" // perform our 64 bit multiply, low side. - "vpbroadcastd\t1(%%r9)%{uint8%},\t%%zmm6\n\t" // load the item we will be multiplying by. - "vpmadd231d\t%%zmm2,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. - "vmovdqa32\t\t48(%%r12)%{sint8%},\t%%zmm3\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. - "vmovdqa32\t\t48(%%r8)%{uint8%},\t%%zmm4\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. - "vpmulld\t%%zmm3,\t%%zmm4,\t%%zmm5\n\t" // perform our 64 bit multiply, low side. - "vpmadd231d\t%%zmm5,\t%%zmm6,\t%%zmm7\n\t" // perform our multiply-add. - "je\t2f\n\t" // if this is the last time through our loop, jump to 2. - "vprefetche0\t64(%%r12)\n\t" // otherwise, prepare for another run-through. - "vprefetche0\t64(%%r8)\n\t" - "vprefetche2\t128(%%r12)\n\t" - "vprefetche2\t128(%%r8)\n\t" - "add\t$64,\t%%r12\n\t" - "add\t$64,\t%%r8\n\t" - "add\t$2,\t%%r9\n\t" - "jmp\t1b\n\t" - "2:\n\t" - "vmovdqa32\t\t%%zmm7,\t(%[RES])\n\t" // save the result. + "inc\t%%ecx\n\t" // we are in our loop, increment our counter. 
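    /* Annotation, an assumption from the KNC unaligned-load idiom rather than the patch:
       KNC has no unaligned vector load, so a vloadunpackld from the element address is
       paired with a vloadunpackhd addressed one cache line later to pick up whatever
       crossed the 64-byte boundary. The OFFSET test above bumped exactly one of r12-r15
       by 64 so that, for this particular misalignment, the straddling 16-byte group gets
       its upper part from the following cache line. */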
+ "vloadunpackld\t\t(%%r11)%{sint8%},\t%%zmm8\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vloadunpackld\t\t16(%%r11)%{sint8%},\t%%zmm9\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vloadunpackld\t\t32(%%r11)%{sint8%},\t%%zmm10\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vloadunpackld\t\t48(%%r11)%{sint8%},\t%%zmm11\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vprefetch1\t128(%%r11)\n\t" // prepare for a run-through. + "add\t$64,\t%%r11\n\t" + "vloadunpackhd\t\t(%%r12)%{sint8%},\t%%zmm8\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "add\t$64,\t%%r12\n\t" + "vloadunpackhd\t\t16(%%r13)%{sint8%},\t%%zmm9\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "add\t$64,\t%%r13\n\t" + "vloadunpackhd\t\t32(%%r14)%{sint8%},\t%%zmm10\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "add\t$64,\t%%r14\n\t" + "vloadunpackhd\t\t48(%%r15)%{sint8%},\t%%zmm11\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "add\t$64,\t%%r15\n\t" + "vmovdqa32\t\t(%%r8)%{uint8%},\t%%zmm12\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. + "vpmulld\t%%zmm8,\t%%zmm12,\t%%zmm13\n\t" // perform our 64 bit multiply, low side. + "vmovdqa32\t\t16(%%r8)%{uint8%},\t%%zmm14\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. + "vpmulld\t%%zmm9,\t%%zmm14,\t%%zmm15\n\t" // perform our 64 bit multiply, low side. + "vmovdqa32\t\t32(%%r8)%{uint8%},\t%%zmm0\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. + "vpmulld\t%%zmm10,\t%%zmm0,\t%%zmm1\n\t" // perform our 64 bit multiply, low side. + "vmovdqa32\t\t48(%%r8)%{uint8%},\t%%zmm2\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. + "vpmulld\t%%zmm11,\t%%zmm2,\t%%zmm3\n\t" // perform our 64 bit multiply, low side. + "vprefetch1\t64(%%r8)\n\t" // prepare for a run-through. + "add\t$64,\t%%r8\n\t" + "vpbroadcastd\t(%%r9)%{uint8%},\t%%zmm4\n\t" // load the item we will be multiplying by. + "vpbroadcastd\t1(%%r9)%{uint8%},\t%%zmm8\n\t" // load the item we will be multiplying by. + "vprefetch1\t2(%%r9)\n\t" + "add\t$2,\t%%r9\n\t" + "vprefetch0\t(%%r11)\n\t" // prepare for a run-through. + "vprefetch0\t64(%%r11)\n\t" // prepare for a run-through. + "vprefetch0\t(%%r8)\n\t" // prepare for a run-through. + "vprefetch0\t(%%r9)\n\t" // prepare for a run-through. + "cmp\t$4,\t%%ecx\n\t" // see if this is our last run-through. + "vpmadd231d\t%%zmm13,\t%%zmm4,\t%%zmm7\n\t" // perform our multiply-add. + "vpmadd231d\t%%zmm15,\t%%zmm4,\t%%zmm7\n\t" // perform our multiply-add. + "vpmadd231d\t%%zmm1,\t%%zmm8,\t%%zmm7\n\t" // perform our multiply-add. + "vpmadd231d\t%%zmm3,\t%%zmm8,\t%%zmm7\n\t" // perform our multiply-add. + "jl\t1b\n\t" + "vcvtfxpntdq2ps\t$0,%%zmm7,\t%%zmm9\n\t" // convert our ints to floats. + "vfmadd231ps\t%%zmm5,\t%%zmm9,\t%%zmm6\n\t" // Perform a fused multiply add. + "vmovaps\t\t%%zmm6,\t(%[RES])\n\t" // save the result. 
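    /* Annotation (editorial sketch): a scalar view of the epilogue above is
         factor = fp16_to_fp32(scaleX) * scaleY;            // zmm5, set up before the loop
         for (l = 0; l < 16; ++l) res[l] += factor * (float)acc[l];
       where acc is the int32 accumulator built by the vpmadd231d chain
       (vcvtfxpntdq2ps converts it, vfmadd231ps folds it into the running sum). */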
: [RES] "+r" (res) - : [SRC11] "r" (src11), - [SRC21] "r" (src21), - [SCALE] "r" (scale), + : [SRC8] "r" (q8), + [OFFSET] "m" (q8offset), + [SRC5] "r" (q5), + [SCALE] "r" (scale), + [SCALEX] "m" (scaleX), + [SCALEY] "m" (scaleY), [Z] "m" (zero) - : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "cc", "ecx", "r8", "r9", "r12", "memory"); + : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "cc", "ecx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "memory"); } // Unpack 256 unsigned 5 bit values into an 8 bit vector. -inline static void GGML_5bit_Unpack (const uint8x16_t * q4, const uint8_t * q1, uint8x16_t * dst) +// Handles q4 not being aligned correctly. +// Requires dst to be aligned. +inline static void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint8x16_t * dst) { uint8_t lowmask = 0x0F; - uint32_t allmask=0xFFFFFFFF; uint8_t m=1; uint8_t bit5 = 0x10; __asm__ __volatile__ ( - "vprefetche0\t(%[SRC1])\n\t" // Issue our memory requests first thing. - "vprefetche0\t(%[SRC4])\n\t" - "vprefetche1\t64(%[SRC4])\n\t" - "mov\t%[SRC4],\t%%r12\n\t" // load the address of the head of our 4-bit list. - "mov\t%[DST],\t%%r8\n\t" // load the address of the head of our destination list. - "mov\t$0,%%ecx\n\t" // initialize our counter. - "vmovdqa32\t(%[SRC1])%{uint8%},\t%%zmm6\n\t" // move 16 packed sets of single bits into the lower 8 bits of zmm6. - "vmovdqa32\t16(%[SRC1])%{uint8%},\t%%zmm7\n\t" // move the next 16 packed sets of single bits into the lower 8 bits of zmm7. - "vpbroadcastd\t%[MASK]%{uint8%},\t%%zmm2\n\t " // load our mask. - "vpbroadcastd\t%[BIT5]%{uint8},\t%%zmm9\n\t" // load the bit we want to add (conditionally). - "vpbroadcastd\t%[M]%{uint8%},\t%%zmm8\n\t" // select which bit we want to test for. + "vprefetch0\t(%[SRC1])\n\t" // Issue our memory requests first thing. + "vprefetch0\t(%[SRC4])\n\t" + "vprefetchenta\t(%[DST])\n\t" + "mov\t%[SRC4],\t%%r9\n\t" // load the address of the head of our 4-bit list. + "mov\t%[DST],\t%%r8\n\t" // load the address of the head of our destination list. + "mov\t$0,%%ecx\n\t" // initialize our counter. + "vpbroadcastd\t%[MASK]%{uint8%},\t%%zmm0\n\t" // load our mask. + "vpbroadcastd\t%[BIT5]%{uint8},\t%%zmm1\n\t" // load the bit we want to add (conditionally). + "vpbroadcastd\t%[M]%{uint8%},\t%%zmm2\n\t" // select which bit we want to test for. + "vmovdqa32\t(%[SRC1])%{uint8%},\t%%zmm3\n\t" // load 16 sets of 8 bit packed single bits. + "vmovdqa32\t16(%[SRC1])%{uint8%},\t%%zmm4\n\t" // load the next 16 sets of 8 bit packed single bits. + "1:\n\t" - "inc\t%%ecx\n\t" // we are in the loop. increment the counter. - "vptestmd\t%%zmm6,\t%%zmm8,\t%%k1\n\t" // perform our test. - "vptestmd\t%%zmm7,\t%%zmm8,\t%%k2\n\t" // perform our test. - "vmovdqa32\t\t(%%r12)%{uint8%},\t%%zmm0\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vpandd\t%%zmm0,\t%%zmm2,\t%%zmm4\n\t" // apply a mask, storing the low four bits of vector zmm0 into zmm4. - "vpaddd\t%%zmm4,%%zmm9,%%zmm4%{%%k1%}\n\t" // turn on bit 5 for all values that passed the prior test. - "vmovdqa32\t\t%%zmm4%{uint8%},\t(%%r8)\n\t" // save our result. - "vmovdqa32\t\t16(%%r12)%{uint8%},\t%%zmm1\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vpandd\t%%zmm1,\t%%zmm2,\t%%zmm5\n\t" // apply a mask, storing the next low four bits of vector zmm1 into zmm5. 
- "vpaddd\t%%zmm5,%%zmm9,%%zmm5%{%%k2%}\n\t" // turn on bit 5 for all values that passed the prior test. - "vmovdqa32\t\t%%zmm5%{uint8%},\t16(%%r8)\n\t" // save our result. + "inc\t%%ecx\n\t" // we are in the loop. increment the counter. + + "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // perform our test. + "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // perform our test. + + "vloadunpackld\t\t(%%r9)%{uint8%},\t%%zmm5\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vloadunpackhd\t\t16(%%r9)%{uint8%},\t%%zmm5\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vpandd\t%%zmm0,\t%%zmm5,\t%%zmm6\n\t" // apply a mask, storing the low four bits of vector zmm5 into zmm6. + "vpord\t%%zmm1,%%zmm6,%%zmm6%{%%k1%}\n\t" // turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm6%{uint8%},\t(%%r8)\n\t" // save our result. + + "vloadunpackld\t\t(%%r9)%{uint8%},\t%%zmm7\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vloadunpackhd\t\t16(%%r9)%{uint8%},\t%%zmm7\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vprefetch1\t32(%%r9)\n\t" // pull the next set of 4 bit sequences into the L2 cache. + "vpandd\t%%zmm0,\t%%zmm7,\t%%zmm8\n\t" // apply a mask, storing the next low four bits of vector zmm1 into zmm5. + "vpaddd\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm8%{uint8%},\t16(%%r8)\n\t" // save our result. + "add\t$32,\t%%r8\n\t" "cmp\t$4,\t%%ecx\n\t" - "vpslld\t$1,\t%%zmm8,\t%%zmm8\n\t" // select which bit we want to test for. - "vptestmd\t%%zmm6,\t%%zmm8,\t%%k1\n\t" // perform our test. - "vptestmd\t%%zmm7,\t%%zmm8,\t%%k2\n\t" // perform our test. - "vpsrld\t$4,\t%%zmm0,\t%%zmm4\n\t" // load our even 4 bit sequence into zmm4. - "vpaddd\t%%zmm4,%%zmm9,%%zmm4%{%%k1%}\n\t" // turn on bit 5 for all values that passed the prior test. - "vmovdqa32\t\t%%zmm4%{uint8%},\t(%%r8)\n\t" // save our result. - "vpsrld\t$4,\t%%zmm1,\t%%zmm5\n\t" // load our even 4 bit sequence into zmm5. - "vpaddd\t%%zmm5,%%zmm9,%%zmm5%{%%k2%}\n\t" // turn on bit 5 for all values that passed the prior test. - "vmovdqa32\t\t%%zmm5%{uint8%},\t16(%%r8)\n\t" // save our result. + + "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // select which bit we want to test for. + + "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // perform our test. + "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // perform our test. + "vpsrld\t$4,\t%%zmm5,\t%%zmm6\n\t" // load our even 4 bit sequence + "vpsrld\t$4,\t%%zmm7,\t%%zmm8\n\t" // load our even 4 bit sequence + "vpord\t%%zmm1,%%zmm6,%%zmm6%{%%k1%}\n\t" // turn on bit 5 for all values that passed the prior test. + "vpord\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm6%{uint8%},\t(%%r8)\n\t" // save our result. + "vmovdqa32\t\t%%zmm8%{uint8%},\t16(%%r8)\n\t" // save our result. + "vprefetchenta\t32(%%r8)\n\t" + "je\t2f\n\t" - "vpslld\t$1,\t%%zmm8,\t%%zmm8\n\t" // select which bit we want to test for. - "add\t$32,\t%%r12\n\t" + + "vprefetch0\t32(%%r9)\n\t" + "vprefetch1\t96(%%r9)\n\t" + "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // select which bit we want to test for. 
+ "add\t$32,\t%%r9\n\t" "add\t$32,\t%%r8\n\t" "jmp\t1b\n\t" "2:" @@ -152,9 +244,8 @@ inline static void GGML_5bit_Unpack (const uint8x16_t * q4, const uint8_t * q1, [SRC1] "r" (q1), [MASK] "m" (lowmask), [M] "m" (m), - [ALL] "m" (allmask), [BIT5] "m" (bit5) - : "zmm0", "zmm1", "zmm2", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "cc", "ecx", "k1", "k2", "r12", "r8", "memory" + : "zmm0", "zmm1", "zmm2", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "cc", "ecx", "k1", "k2", "r12", "r8", "memory" ); } @@ -185,19 +276,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r float sumf = 0; for (int i = 0; i < nb; ++i) { - int8x16_t q8copy [QK_K]; - int32x16_t aux32; - uint8x16_t q4copyvec [QK_K/32]; - uint8x16_t aux8 [QK_K/16]; - // Fill in our 8 bit vector from y[]. required, because there is no good way to align members of y[], And I haven't mastered unaligned assembly yet... - memcpy (q8copy, y[i].qs, QK_K); + uint8x16_t q5 [QK_K/16]; - // Fill in our 4 bit vector from x[]. required, because there is no good way to align members of x[], And I haven't mastered unaligned assembly yet... - memcpy (q4copyvec, x[i].qs, QK_K/2); - - // combine our 4 and 1 bit vector sets into an 8 bit value. - GGML_5bit_Unpack(q4copyvec, x[i].qh, aux8); + // combine our 4 and 1 bit vector sets into a 5 bit vector (in 8 bits). + GGML_5bit_Unpack((const uint8x16_t *)x[i].qs, x[i].qh, q5); // extract scales and mins.. memcpy(utmp, x[i].scales, 12); @@ -207,14 +290,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r utmp[2] = uaux; utmp[0] &= kmask1; - // FIXME: while comparing FMA output to the original output, the original had an error. hunt it down. - GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16(q8copy, aux8, scales, &aux32); + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; - int sumi = 0; - for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - for (int l = 0; l < GGML_F32_EPR; ++l) ((float *)&sums)[l] += d * ((int32_t *)&aux32)[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + + // FIXME: while comparing FMA output to the original output, the original had an error. hunt it down. + GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned((const int8x16_t *)y[i].qs, q5, scales, x[i].d, y[i].d, &sums); + + const float dmin = GGML_PHI_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } diff --git a/ggml-phi-knc-dot_q5_K_q8_K.h b/ggml-phi-knc-dot_q5_K_q8_K.h index e1e15d400..bc7fee77f 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.h +++ b/ggml-phi-knc-dot_q5_K_q8_K.h @@ -8,8 +8,24 @@ extern "C" { #endif -/* A forward declaration, to keep GCC happy. */ + /* A forward declaration, to keep GCC happy. */ void ggml_vec_dot_q5_K_q8_K(int n, float *restrict s, size_t bs, const void *restrict vx, size_t bx, const void *restrict vy, size_t by, int nrc); + // Force an alignment onto these vectors. + typedef float float32x16_t __attribute__((vector_size (64), aligned(64))); + typedef int8_t int8x16_t __attribute__((vector_size (16), aligned(16))); + typedef uint8_t uint8x16_t __attribute__((vector_size (16), aligned(16))); + typedef int32_t int32x16_t __attribute__((vector_size (64), aligned(64))); + + // Zero out a vector of Floats + void GGML_F32x16_VEC_ZERO(float32x16_t *target); + // Convert an FP16 value to FP32(Float). + float GGML_PHI_FP16_TO_FP32(ggml_fp16_t src); + // Convert an FP32 value to FP16. 
+ ggml_fp16_t GGML_PHI_FP32_TO_FP16(float src); + // Create a 5 bit int vector from a 4 bit vector and a 1 bit vector, both in packed forms. + void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint8x16_t * dst); + // Multiply a Q5 and Q8 vector against each other, with some scaling. + void GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned (const int8x16_t *q8, uint8x16_t *q5, const uint8_t *scale, ggml_fp16_t scaleX, float scaleY, float32x16_t *res); #ifdef __cplusplus } From 6d160902467384880de036ba9de4d168d6a6ad9b Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Mon, 22 Apr 2024 18:22:22 +0000 Subject: [PATCH 066/105] fix some small errors. --- ggml-phi-knc-dot_q5_K_q8_K.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index a4316204e..21294c6cb 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -177,7 +177,7 @@ void GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned (const int8x16_ // Unpack 256 unsigned 5 bit values into an 8 bit vector. // Handles q4 not being aligned correctly. // Requires dst to be aligned. -inline static void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint8x16_t * dst) +void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint8x16_t * dst) { uint8_t lowmask = 0x0F; uint8_t m=1; @@ -280,7 +280,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r uint8x16_t q5 [QK_K/16]; // combine our 4 and 1 bit vector sets into a 5 bit vector (in 8 bits). - GGML_5bit_Unpack((const uint8x16_t *)x[i].qs, x[i].qh, q5); + GGML_5bit_Unpack_Unaligned((const uint8x16_t *)x[i].qs, x[i].qh, q5); // extract scales and mins.. memcpy(utmp, x[i].scales, 12); From 90e99eaf1c110db932d74b5cc1deee57c6353334 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Mon, 22 Apr 2024 18:29:31 +0000 Subject: [PATCH 067/105] fix an offset error, and get rid of tabs. --- ggml-phi-knc-dot_q5_K_q8_K.c | 182 +++++++++++++++++------------------ 1 file changed, 91 insertions(+), 91 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 21294c6cb..5f8a53e25 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -83,95 +83,95 @@ void GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned (const int8x16_ uint64_t q8offset=((uint64_t) q8) & 0x3f; __asm__ __volatile__ ( - "vprefetchenta\t(%[RES])\n\t" - "vprefetch0\t64(%[SCALE])\n\t" - "vprefetch0\t(%[SRC8])\n\t" - "vprefetch0\t64(%[SRC8])\n\t" - "vprefetch0\t(%[SRC5])\n\t" - "mov\t%[SRC8],\t%%r11\n\t" // use r11 to store the address for vloadunpackld. - "mov\t%[SRC5],\t%%r8\n\t" + "vprefetchenta\t(%[RES])\n\t" + "vprefetch0\t64(%[SCALE])\n\t" + "vprefetch0\t(%[SRC8])\n\t" + "vprefetch0\t64(%[SRC8])\n\t" + "vprefetch0\t(%[SRC5])\n\t" + "mov\t%[SRC8],\t%%r11\n\t" // use r11 to store the address for vloadunpackld. + "mov\t%[SRC5],\t%%r8\n\t" "mov\t%[SCALE],\t%%r9\n\t" - "mov\t$0,\t%%ecx\n\t" - "mov\t%[SRC8],\t%%r15\n\t" // use r12-r15 to store the addresses for vloadunpackhd. - "mov\t%[SRC8],\t%%r14\n\t" - "mov\t%[SRC8],\t%%r13\n\t" - "mov\t%[SRC8],\t%%r12\n\t" - "mov\t%[OFFSET],\t%%r10\n\t" - "cmp\t$32,%%r10\n\t" // Examine OFFSET, and decide which (if any) of the vloadunpackhd invocations needs to be increaned by 64. - "jl\t20f\n\t" - "cmp\t$48,%%r10\n\t" - "jl\t21f\n\t" - "add\t$64,%%r12\n\t" // greater than 48. 
- "jmp\t18f\n\t" - "21:\n\t" - "add\t$64,%%r13\n\t" // between 48 and 32. - "jmp\t18f\n\t" - "20:\n\t" // less than 32... - "cmp\t$16,%%r10\n\t" - "jz\t18f\n\t" // zero - "jl\t23f\n\t" - "add\t$64,%%r14\n\t" // between 32 and 16... - "jmp\t18f\n\t" - "23:\n\t" - "add\t$64,%%r15\n\t" // between 16 and zero.. - "18:\n\t" - "vbroadcastss\t%[SCALEY],\t%%zmm3\n\t" // load the scale factors coresponding to the two input vectors. - "vbroadcastss\t%[SCALEX]%{float16%},\t%%zmm4\n\t" - "vmulps\t%%zmm3,\t%%zmm4,\t%%zmm5\n\t" // prepare the factor we're going to multiply the result by.. - "vmovaps\t\t(%[RES]),\t%%zmm6\n\t" // load our inital state from sum.. + "mov\t$0,\t%%ecx\n\t" + "mov\t%[SRC8],\t%%r15\n\t" // use r12-r15 to store the addresses for vloadunpackhd. + "mov\t%[SRC8],\t%%r14\n\t" + "mov\t%[SRC8],\t%%r13\n\t" + "mov\t%[SRC8],\t%%r12\n\t" + "mov\t%[OFFSET],\t%%r10\n\t" + "cmp\t$32,%%r10\n\t" // Examine OFFSET, and decide which (if any) of the vloadunpackhd invocations needs to be increaned by 64. + "jl\t20f\n\t" + "cmp\t$48,%%r10\n\t" + "jl\t21f\n\t" + "add\t$64,%%r12\n\t" // greater than 48. + "jmp\t18f\n\t" + "21:\n\t" + "add\t$64,%%r13\n\t" // between 48 and 32. + "jmp\t18f\n\t" + "20:\n\t" // less than 32... + "cmp\t$16,%%r10\n\t" + "jz\t18f\n\t" // zero + "jl\t23f\n\t" + "add\t$64,%%r14\n\t" // between 32 and 16... + "jmp\t18f\n\t" + "23:\n\t" + "add\t$64,%%r15\n\t" // between 16 and zero.. + "18:\n\t" + "vbroadcastss\t%[SCALEY],\t%%zmm3\n\t" // load the scale factors coresponding to the two input vectors. + "vbroadcastss\t%[SCALEX]%{float16%},\t%%zmm4\n\t" + "vmulps\t%%zmm3,\t%%zmm4,\t%%zmm5\n\t" // prepare the factor we're going to multiply the result by.. + "vmovaps\t\t(%[RES]),\t%%zmm6\n\t" // load our inital state from sum.. "vpbroadcastd\t%[Z]%{uint8%},\t%%zmm7\n\t" // empty our result. "1:\n\t" "inc\t%%ecx\n\t" // we are in our loop, increment our counter. - "vloadunpackld\t\t(%%r11)%{sint8%},\t%%zmm8\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. - "vloadunpackld\t\t16(%%r11)%{sint8%},\t%%zmm9\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. - "vloadunpackld\t\t32(%%r11)%{sint8%},\t%%zmm10\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. - "vloadunpackld\t\t48(%%r11)%{sint8%},\t%%zmm11\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. - "vprefetch1\t128(%%r11)\n\t" // prepare for a run-through. - "add\t$64,\t%%r11\n\t" - "vloadunpackhd\t\t(%%r12)%{sint8%},\t%%zmm8\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. - "add\t$64,\t%%r12\n\t" - "vloadunpackhd\t\t16(%%r13)%{sint8%},\t%%zmm9\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. - "add\t$64,\t%%r13\n\t" - "vloadunpackhd\t\t32(%%r14)%{sint8%},\t%%zmm10\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. - "add\t$64,\t%%r14\n\t" - "vloadunpackhd\t\t48(%%r15)%{sint8%},\t%%zmm11\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. - "add\t$64,\t%%r15\n\t" - "vmovdqa32\t\t(%%r8)%{uint8%},\t%%zmm12\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. - "vpmulld\t%%zmm8,\t%%zmm12,\t%%zmm13\n\t" // perform our 64 bit multiply, low side. - "vmovdqa32\t\t16(%%r8)%{uint8%},\t%%zmm14\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. - "vpmulld\t%%zmm9,\t%%zmm14,\t%%zmm15\n\t" // perform our 64 bit multiply, low side. 
- "vmovdqa32\t\t32(%%r8)%{uint8%},\t%%zmm0\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. - "vpmulld\t%%zmm10,\t%%zmm0,\t%%zmm1\n\t" // perform our 64 bit multiply, low side. - "vmovdqa32\t\t48(%%r8)%{uint8%},\t%%zmm2\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. - "vpmulld\t%%zmm11,\t%%zmm2,\t%%zmm3\n\t" // perform our 64 bit multiply, low side. - "vprefetch1\t64(%%r8)\n\t" // prepare for a run-through. - "add\t$64,\t%%r8\n\t" - "vpbroadcastd\t(%%r9)%{uint8%},\t%%zmm4\n\t" // load the item we will be multiplying by. - "vpbroadcastd\t1(%%r9)%{uint8%},\t%%zmm8\n\t" // load the item we will be multiplying by. - "vprefetch1\t2(%%r9)\n\t" - "add\t$2,\t%%r9\n\t" - "vprefetch0\t(%%r11)\n\t" // prepare for a run-through. - "vprefetch0\t64(%%r11)\n\t" // prepare for a run-through. - "vprefetch0\t(%%r8)\n\t" // prepare for a run-through. - "vprefetch0\t(%%r9)\n\t" // prepare for a run-through. - "cmp\t$4,\t%%ecx\n\t" // see if this is our last run-through. - "vpmadd231d\t%%zmm13,\t%%zmm4,\t%%zmm7\n\t" // perform our multiply-add. - "vpmadd231d\t%%zmm15,\t%%zmm4,\t%%zmm7\n\t" // perform our multiply-add. - "vpmadd231d\t%%zmm1,\t%%zmm8,\t%%zmm7\n\t" // perform our multiply-add. - "vpmadd231d\t%%zmm3,\t%%zmm8,\t%%zmm7\n\t" // perform our multiply-add. - "jl\t1b\n\t" - "vcvtfxpntdq2ps\t$0,%%zmm7,\t%%zmm9\n\t" // convert our ints to floats. - "vfmadd231ps\t%%zmm5,\t%%zmm9,\t%%zmm6\n\t" // Perform a fused multiply add. - "vmovaps\t\t%%zmm6,\t(%[RES])\n\t" // save the result. + "vloadunpackld\t\t(%%r11)%{sint8%},\t%%zmm8\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vloadunpackld\t\t16(%%r11)%{sint8%},\t%%zmm9\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vloadunpackld\t\t32(%%r11)%{sint8%},\t%%zmm10\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vloadunpackld\t\t48(%%r11)%{sint8%},\t%%zmm11\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vprefetch1\t128(%%r11)\n\t" // prepare for a run-through. + "add\t$64,\t%%r11\n\t" + "vloadunpackhd\t\t(%%r12)%{sint8%},\t%%zmm8\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "add\t$64,\t%%r12\n\t" + "vloadunpackhd\t\t16(%%r13)%{sint8%},\t%%zmm9\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "add\t$64,\t%%r13\n\t" + "vloadunpackhd\t\t32(%%r14)%{sint8%},\t%%zmm10\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "add\t$64,\t%%r14\n\t" + "vloadunpackhd\t\t48(%%r15)%{sint8%},\t%%zmm11\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "add\t$64,\t%%r15\n\t" + "vmovdqa32\t\t(%%r8)%{uint8%},\t%%zmm12\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. + "vpmulld\t%%zmm8,\t%%zmm12,\t%%zmm13\n\t" // perform our 64 bit multiply, low side. + "vmovdqa32\t\t16(%%r8)%{uint8%},\t%%zmm14\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. + "vpmulld\t%%zmm9,\t%%zmm14,\t%%zmm15\n\t" // perform our 64 bit multiply, low side. + "vmovdqa32\t\t32(%%r8)%{uint8%},\t%%zmm0\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. + "vpmulld\t%%zmm10,\t%%zmm0,\t%%zmm1\n\t" // perform our 64 bit multiply, low side. + "vmovdqa32\t\t48(%%r8)%{uint8%},\t%%zmm2\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. 
+ "vpmulld\t%%zmm11,\t%%zmm2,\t%%zmm3\n\t" // perform our 64 bit multiply, low side. + "vprefetch1\t64(%%r8)\n\t" // prepare for a run-through. + "add\t$64,\t%%r8\n\t" + "vpbroadcastd\t(%%r9)%{uint8%},\t%%zmm4\n\t" // load the item we will be multiplying by. + "vpbroadcastd\t1(%%r9)%{uint8%},\t%%zmm8\n\t" // load the item we will be multiplying by. + "vprefetch1\t2(%%r9)\n\t" + "add\t$2,\t%%r9\n\t" + "vprefetch0\t(%%r11)\n\t" // prepare for a run-through. + "vprefetch0\t64(%%r11)\n\t" // prepare for a run-through. + "vprefetch0\t(%%r8)\n\t" // prepare for a run-through. + "vprefetch0\t(%%r9)\n\t" // prepare for a run-through. + "cmp\t$4,\t%%ecx\n\t" // see if this is our last run-through. + "vpmadd231d\t%%zmm13,\t%%zmm4,\t%%zmm7\n\t" // perform our multiply-add. + "vpmadd231d\t%%zmm15,\t%%zmm4,\t%%zmm7\n\t" // perform our multiply-add. + "vpmadd231d\t%%zmm1,\t%%zmm8,\t%%zmm7\n\t" // perform our multiply-add. + "vpmadd231d\t%%zmm3,\t%%zmm8,\t%%zmm7\n\t" // perform our multiply-add. + "jl\t1b\n\t" + "vcvtfxpntdq2ps\t$0,%%zmm7,\t%%zmm9\n\t" // convert our ints to floats. + "vfmadd231ps\t%%zmm5,\t%%zmm9,\t%%zmm6\n\t" // Perform a fused multiply add. + "vmovaps\t\t%%zmm6,\t(%[RES])\n\t" // save the result. : [RES] "+r" (res) : [SRC8] "r" (q8), - [OFFSET] "m" (q8offset), - [SRC5] "r" (q5), + [OFFSET] "m" (q8offset), + [SRC5] "r" (q5), [SCALE] "r" (scale), - [SCALEX] "m" (scaleX), - [SCALEY] "m" (scaleY), + [SCALEX] "m" (scaleX), + [SCALEY] "m" (scaleY), [Z] "m" (zero) - : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "cc", "ecx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "memory"); + : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "cc", "ecx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "memory"); } // Unpack 256 unsigned 5 bit values into an 8 bit vector. @@ -208,18 +208,18 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "vpord\t%%zmm1,%%zmm6,%%zmm6%{%%k1%}\n\t" // turn on bit 5 for all values that passed the prior test. "vmovdqa32\t\t%%zmm6%{uint8%},\t(%%r8)\n\t" // save our result. - "vloadunpackld\t\t(%%r9)%{uint8%},\t%%zmm7\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vloadunpackhd\t\t16(%%r9)%{uint8%},\t%%zmm7\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vprefetch1\t32(%%r9)\n\t" // pull the next set of 4 bit sequences into the L2 cache. + "vloadunpackld\t\t16(%%r9)%{uint8%},\t%%zmm7\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vloadunpackhd\t\t32(%%r9)%{uint8%},\t%%zmm7\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vprefetch1\t32(%%r9)\n\t" // pull the next set of 4 bit sequences into the L2 cache. "vpandd\t%%zmm0,\t%%zmm7,\t%%zmm8\n\t" // apply a mask, storing the next low four bits of vector zmm1 into zmm5. "vpaddd\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // turn on bit 5 for all values that passed the prior test. "vmovdqa32\t\t%%zmm8%{uint8%},\t16(%%r8)\n\t" // save our result. - + "add\t$32,\t%%r8\n\t" "cmp\t$4,\t%%ecx\n\t" "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // select which bit we want to test for. - + "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // perform our test. "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // perform our test. 
"vpsrld\t$4,\t%%zmm5,\t%%zmm6\n\t" // load our even 4 bit sequence @@ -228,12 +228,12 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "vpord\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // turn on bit 5 for all values that passed the prior test. "vmovdqa32\t\t%%zmm6%{uint8%},\t(%%r8)\n\t" // save our result. "vmovdqa32\t\t%%zmm8%{uint8%},\t16(%%r8)\n\t" // save our result. - "vprefetchenta\t32(%%r8)\n\t" + "vprefetchenta\t32(%%r8)\n\t" "je\t2f\n\t" - "vprefetch0\t32(%%r9)\n\t" - "vprefetch1\t96(%%r9)\n\t" + "vprefetch0\t32(%%r9)\n\t" + "vprefetch1\t96(%%r9)\n\t" "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // select which bit we want to test for. "add\t$32,\t%%r9\n\t" "add\t$32,\t%%r8\n\t" @@ -290,8 +290,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r utmp[2] = uaux; utmp[0] &= kmask1; - int sumi = 0; - for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; + int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; // FIXME: while comparing FMA output to the original output, the original had an error. hunt it down. From 8cae9a9ef6db10777dfb76fcc3c56cadfed233ce Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Wed, 24 Apr 2024 17:38:42 +0000 Subject: [PATCH 068/105] comment and spacing fixes. --- ggml-phi-knc-dot_q5_K_q8_K.c | 69 +++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 5f8a53e25..ffa34a314 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -1,5 +1,6 @@ /* Xeon PHI IMCI support. */ /* formatted by using emacs, with (M-x set-variable RET c-basic-offset RET 4 RET) executed. */ +/* formatted by using emacs, with (M-x set-variable RET indent-tabs-mode RET nil RET) executed. */ // For uint32_t #include @@ -35,48 +36,50 @@ void GGML_F32x16_VEC_ZERO(float32x16_t *target) } -/* convert a FP16 to a FP32. */ +/* Convert a FP16 to a FP32. */ float GGML_PHI_FP16_TO_FP32(ggml_fp16_t src) { - // we only care aboun one result. - uint32_t mask=0x0001; + // we only care aboun one result. + uint32_t mask=0x0001; - // we declare this as an array, so it ends up in a different memory section. - float f32[1] __attribute__((aligned(64))); + // we declare this as an array, so it ends up in a different memory section. + float f32[1] __attribute__((aligned(64))); - __asm__ __volatile__ ( - "kmov\t%[M],\t%%k1\n\t" - "vbroadcastss\t%[SRC]%{float16%},\t%%zmm1%{%%k1%}\n\t" - "vmovaps\t\t%%zmm1,\t%[DST]%{%%k1%}\n\t" - : [DST] "+m" (f32) - : [SRC] "m" (src), - [M] "r" (mask) - : "zmm1", "memory", "k1"); - return f32[0]; + __asm__ __volatile__ ( + "kmov\t%[M],\t%%k1\n\t" + "vbroadcastss\t%[SRC]%{float16%},\t%%zmm1%{%%k1%}\n\t" + "vmovaps\t\t%%zmm1,\t%[DST]%{%%k1%}\n\t" + : [DST] "+m" (f32) + : [SRC] "m" (src), + [M] "r" (mask) + : "zmm1", "memory", "k1"); + return f32[0]; } -/* convert a FP32 to a FP16. */ +/* Convert a FP32 to a FP16. */ ggml_fp16_t GGML_PHI_FP32_TO_FP16(float src) { - uint32_t mask=0x0001; + uint32_t mask=0x0001; - // we declare this as an array, so it ends up in a different memory section. - ggml_fp16_t f16[1] __attribute__((aligned(64))); + // we declare this as an array, so it ends up in a different memory section. 
+ ggml_fp16_t f16[1] __attribute__((aligned(64))); - __asm__ __volatile__ ( - "kmov\t%[M],\t%%k1\n\t" - "vbroadcastss\t%[SRC],\t%%zmm2%{%%k1%}\n\t" - "vmovaps\t\t%%zmm2%{float16%},\t%[DST]%{%%k1%}\n\t" - : [DST] "+m" (f16) - : [SRC] "m" (src), - [M] "r" (mask) - : "zmm2", "memory", "k1"); - return f16[0]; + __asm__ __volatile__ ( + "kmov\t%[M],\t%%k1\n\t" + "vbroadcastss\t%[SRC],\t%%zmm2%{%%k1%}\n\t" + "vmovaps\t\t%%zmm2%{float16%},\t%[DST]%{%%k1%}\n\t" + : [DST] "+m" (f16) + : [SRC] "m" (src), + [M] "r" (mask) + : "zmm2", "memory", "k1"); + return f16[0]; } // This function perform two multiplies of an I8x16 and an I8x16 vector into two I16x16 vectors. then does an FMA on the scaled result of multiplying the two I16x16 vectors, adding the result into an I32x16. When done, it multiplies this I32x16 by a float, returning a F32x16. -// it loops 8 times. well, actually four, with an unroll. +// It loops 8 times. well, actually four, with an unroll. +// Handles q8 being unaligned. +// Requires q5 to be aligned. void GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned (const int8x16_t *q8, uint8x16_t *q5, const uint8_t *scale, ggml_fp16_t scaleX, float scaleY, float32x16_t *res) { uint8_t zero = 0; @@ -97,7 +100,7 @@ void GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned (const int8x16_ "mov\t%[SRC8],\t%%r13\n\t" "mov\t%[SRC8],\t%%r12\n\t" "mov\t%[OFFSET],\t%%r10\n\t" - "cmp\t$32,%%r10\n\t" // Examine OFFSET, and decide which (if any) of the vloadunpackhd invocations needs to be increaned by 64. + "cmp\t$32,%%r10\n\t" // Examine OFFSET, and decide which (if any) of the vloadunpackhd invocations needs to be increased by 64. "jl\t20f\n\t" "cmp\t$48,%%r10\n\t" "jl\t21f\n\t" @@ -170,7 +173,7 @@ void GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned (const int8x16_ [SCALE] "r" (scale), [SCALEX] "m" (scaleX), [SCALEY] "m" (scaleY), - [Z] "m" (zero) + [Z] "m" (zero) : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "cc", "ecx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "memory"); } @@ -192,15 +195,15 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "mov\t$0,%%ecx\n\t" // initialize our counter. "vpbroadcastd\t%[MASK]%{uint8%},\t%%zmm0\n\t" // load our mask. "vpbroadcastd\t%[BIT5]%{uint8},\t%%zmm1\n\t" // load the bit we want to add (conditionally). - "vpbroadcastd\t%[M]%{uint8%},\t%%zmm2\n\t" // select which bit we want to test for. + "vpbroadcastd\t%[M]%{uint8%},\t%%zmm2\n\t" // Select which bit we want to test for. Start with bit 1. "vmovdqa32\t(%[SRC1])%{uint8%},\t%%zmm3\n\t" // load 16 sets of 8 bit packed single bits. "vmovdqa32\t16(%[SRC1])%{uint8%},\t%%zmm4\n\t" // load the next 16 sets of 8 bit packed single bits. "1:\n\t" "inc\t%%ecx\n\t" // we are in the loop. increment the counter. - "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // perform our test. - "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // perform our test. + "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // Test to see if our selected bit is set. + "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // Test to see if our selected bit is set. "vloadunpackld\t\t(%%r9)%{uint8%},\t%%zmm5\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. "vloadunpackhd\t\t16(%%r9)%{uint8%},\t%%zmm5\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. 
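
A minimal plain-C sketch of what the FP16 -> FP32 direction of the two masked-broadcast
routines above computes, for illustration only (not part of the patch series; the
function name is made up):

    #include <stdint.h>
    #include <string.h>

    static float fp16_to_fp32_ref(uint16_t h) {
        uint32_t sign = (uint32_t)(h >> 15) << 31;
        uint32_t exp  = (h >> 10) & 0x1f;
        uint32_t mant = h & 0x3ff;
        uint32_t bits;

        if (exp == 0x1f) {                    // infinity or NaN
            bits = sign | 0x7f800000u | (mant << 13);
        } else if (exp != 0) {                // normal number: rebias the exponent
            bits = sign | ((exp + 112) << 23) | (mant << 13);
        } else if (mant == 0) {               // signed zero
            bits = sign;
        } else {                              // subnormal: renormalize the mantissa
            int shift = 0;
            while ((mant & 0x400) == 0) { mant <<= 1; shift++; }
            bits = sign | ((uint32_t)(113 - shift) << 23) | ((mant & 0x3ff) << 13);
        }

        float f;
        memcpy(&f, &bits, sizeof f);
        return f;
    }

The IMCI routines above get the same effect by letting the VPU's {float16} up- and
down-conversion do the work in a single masked lane.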
From d69cf87fce7819dc9b80dbeb237ba6ab548e231a Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Wed, 24 Apr 2024 17:50:12 +0000 Subject: [PATCH 069/105] use or, instead of and. bug fix? --- ggml-phi-knc-dot_q5_K_q8_K.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index ffa34a314..755ecd58f 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -215,7 +215,7 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "vloadunpackhd\t\t32(%%r9)%{uint8%},\t%%zmm7\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. "vprefetch1\t32(%%r9)\n\t" // pull the next set of 4 bit sequences into the L2 cache. "vpandd\t%%zmm0,\t%%zmm7,\t%%zmm8\n\t" // apply a mask, storing the next low four bits of vector zmm1 into zmm5. - "vpaddd\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // turn on bit 5 for all values that passed the prior test. + "vpord\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // turn on bit 5 for all values that passed the prior test. "vmovdqa32\t\t%%zmm8%{uint8%},\t16(%%r8)\n\t" // save our result. "add\t$32,\t%%r8\n\t" From 77d4ca906bd2e1144d46a3472242f8fc828b62ab Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Thu, 25 Apr 2024 21:23:22 +0000 Subject: [PATCH 070/105] spacing and capitalization changes. --- ggml-phi-knc-dot_q5_K_q8_K.c | 148 +++++++++++++++++------------------ 1 file changed, 74 insertions(+), 74 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 755ecd58f..6a8559a08 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -86,16 +86,16 @@ void GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned (const int8x16_ uint64_t q8offset=((uint64_t) q8) & 0x3f; __asm__ __volatile__ ( - "vprefetchenta\t(%[RES])\n\t" + "vprefetchenta\t(%[RES])\n\t" // Issue our memory requests first thing. "vprefetch0\t64(%[SCALE])\n\t" "vprefetch0\t(%[SRC8])\n\t" "vprefetch0\t64(%[SRC8])\n\t" "vprefetch0\t(%[SRC5])\n\t" - "mov\t%[SRC8],\t%%r11\n\t" // use r11 to store the address for vloadunpackld. + "mov\t%[SRC8],\t%%r11\n\t" // Use r11 to store the address for vloadunpackld. "mov\t%[SRC5],\t%%r8\n\t" "mov\t%[SCALE],\t%%r9\n\t" "mov\t$0,\t%%ecx\n\t" - "mov\t%[SRC8],\t%%r15\n\t" // use r12-r15 to store the addresses for vloadunpackhd. + "mov\t%[SRC8],\t%%r15\n\t" // Use r12-r15 to store the addresses for vloadunpackhd. "mov\t%[SRC8],\t%%r14\n\t" "mov\t%[SRC8],\t%%r13\n\t" "mov\t%[SRC8],\t%%r12\n\t" @@ -104,68 +104,68 @@ void GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned (const int8x16_ "jl\t20f\n\t" "cmp\t$48,%%r10\n\t" "jl\t21f\n\t" - "add\t$64,%%r12\n\t" // greater than 48. + "add\t$64,%%r12\n\t" // Greater than 48. "jmp\t18f\n\t" "21:\n\t" - "add\t$64,%%r13\n\t" // between 48 and 32. + "add\t$64,%%r13\n\t" // Between 49 and 32. "jmp\t18f\n\t" - "20:\n\t" // less than 32... + "20:\n\t" // Less than 32... "cmp\t$16,%%r10\n\t" - "jz\t18f\n\t" // zero + "jz\t18f\n\t" // Zero. "jl\t23f\n\t" - "add\t$64,%%r14\n\t" // between 32 and 16... + "add\t$64,%%r14\n\t" // Between 32 and 16. "jmp\t18f\n\t" "23:\n\t" - "add\t$64,%%r15\n\t" // between 16 and zero.. + "add\t$64,%%r15\n\t" // Between 16 and zero. "18:\n\t" - "vbroadcastss\t%[SCALEY],\t%%zmm3\n\t" // load the scale factors coresponding to the two input vectors. + "vbroadcastss\t%[SCALEY],\t%%zmm3\n\t" // Load the scale factors coresponding to the two input vectors. 
"vbroadcastss\t%[SCALEX]%{float16%},\t%%zmm4\n\t" - "vmulps\t%%zmm3,\t%%zmm4,\t%%zmm5\n\t" // prepare the factor we're going to multiply the result by.. - "vmovaps\t\t(%[RES]),\t%%zmm6\n\t" // load our inital state from sum.. - "vpbroadcastd\t%[Z]%{uint8%},\t%%zmm7\n\t" // empty our result. + "vmulps\t%%zmm3,\t%%zmm4,\t%%zmm5\n\t" // Brepare the factor we're going to multiply the result by.. + "vmovaps\t\t(%[RES]),\t%%zmm6\n\t" // Load our inital state from sum.. + "vpbroadcastd\t%[Z]%{uint8%},\t%%zmm7\n\t" // Empty our result. "1:\n\t" - "inc\t%%ecx\n\t" // we are in our loop, increment our counter. - "vloadunpackld\t\t(%%r11)%{sint8%},\t%%zmm8\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. - "vloadunpackld\t\t16(%%r11)%{sint8%},\t%%zmm9\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. - "vloadunpackld\t\t32(%%r11)%{sint8%},\t%%zmm10\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. - "vloadunpackld\t\t48(%%r11)%{sint8%},\t%%zmm11\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. - "vprefetch1\t128(%%r11)\n\t" // prepare for a run-through. + "inc\t%%ecx\n\t" // We are in our loop, increment our counter. + "vloadunpackld\t\t(%%r11)%{sint8%},\t%%zmm8\n\t" // Load the item we will be multiplying from. Upscale it from int8 to int32. + "vloadunpackld\t\t16(%%r11)%{sint8%},\t%%zmm9\n\t" // Load the item we will be multiplying from. Upscale it from int8 to int32. + "vloadunpackld\t\t32(%%r11)%{sint8%},\t%%zmm10\n\t" // Load the item we will be multiplying from. Upscale it from int8 to int32. + "vloadunpackld\t\t48(%%r11)%{sint8%},\t%%zmm11\n\t" // Load the item we will be multiplying from. Upscale it from int8 to int32. + "vprefetch1\t128(%%r11)\n\t" // Prepare for a run-through. "add\t$64,\t%%r11\n\t" - "vloadunpackhd\t\t(%%r12)%{sint8%},\t%%zmm8\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vloadunpackhd\t\t(%%r12)%{sint8%},\t%%zmm8\n\t" // Load the item we will be multiplying from. Upscale it from int8 to int32. "add\t$64,\t%%r12\n\t" - "vloadunpackhd\t\t16(%%r13)%{sint8%},\t%%zmm9\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vloadunpackhd\t\t16(%%r13)%{sint8%},\t%%zmm9\n\t" // Load the item we will be multiplying from. Upscale it from int8 to int32. "add\t$64,\t%%r13\n\t" - "vloadunpackhd\t\t32(%%r14)%{sint8%},\t%%zmm10\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vloadunpackhd\t\t32(%%r14)%{sint8%},\t%%zmm10\n\t" // Load the item we will be multiplying from. Upscale it from int8 to int32. "add\t$64,\t%%r14\n\t" - "vloadunpackhd\t\t48(%%r15)%{sint8%},\t%%zmm11\n\t" // load the item we will be multiplying from. upscale it from int8 to int32. + "vloadunpackhd\t\t48(%%r15)%{sint8%},\t%%zmm11\n\t" // Load the item we will be multiplying from. Upscale it from int8 to int32. "add\t$64,\t%%r15\n\t" - "vmovdqa32\t\t(%%r8)%{uint8%},\t%%zmm12\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. - "vpmulld\t%%zmm8,\t%%zmm12,\t%%zmm13\n\t" // perform our 64 bit multiply, low side. - "vmovdqa32\t\t16(%%r8)%{uint8%},\t%%zmm14\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. - "vpmulld\t%%zmm9,\t%%zmm14,\t%%zmm15\n\t" // perform our 64 bit multiply, low side. - "vmovdqa32\t\t32(%%r8)%{uint8%},\t%%zmm0\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. 
- "vpmulld\t%%zmm10,\t%%zmm0,\t%%zmm1\n\t" // perform our 64 bit multiply, low side. - "vmovdqa32\t\t48(%%r8)%{uint8%},\t%%zmm2\n\t" // load the item we will be multiplying with. upscale it from int8 to int32. - "vpmulld\t%%zmm11,\t%%zmm2,\t%%zmm3\n\t" // perform our 64 bit multiply, low side. - "vprefetch1\t64(%%r8)\n\t" // prepare for a run-through. + "vmovdqa32\t\t(%%r8)%{uint8%},\t%%zmm12\n\t" // Load the item we will be multiplying with. Upscale it from int8 to int32. + "vpmulld\t%%zmm8,\t%%zmm12,\t%%zmm13\n\t" // Perform our 64 bit multiply, low side. + "vmovdqa32\t\t16(%%r8)%{uint8%},\t%%zmm14\n\t" // Load the item we will be multiplying with. Upscale it from int8 to int32. + "vpmulld\t%%zmm9,\t%%zmm14,\t%%zmm15\n\t" // Perform our 64 bit multiply, low side. + "vmovdqa32\t\t32(%%r8)%{uint8%},\t%%zmm0\n\t" // Load the item we will be multiplying with. Upscale it from int8 to int32. + "vpmulld\t%%zmm10,\t%%zmm0,\t%%zmm1\n\t" // Perform our 64 bit multiply, low side. + "vmovdqa32\t\t48(%%r8)%{uint8%},\t%%zmm2\n\t" // Load the item we will be multiplying with. Upscale it from int8 to int32. + "vpmulld\t%%zmm11,\t%%zmm2,\t%%zmm3\n\t" // Perform our 64 bit multiply, low side. + "vprefetch1\t64(%%r8)\n\t" // Prepare for a run-through. "add\t$64,\t%%r8\n\t" - "vpbroadcastd\t(%%r9)%{uint8%},\t%%zmm4\n\t" // load the item we will be multiplying by. - "vpbroadcastd\t1(%%r9)%{uint8%},\t%%zmm8\n\t" // load the item we will be multiplying by. + "vpbroadcastd\t(%%r9)%{uint8%},\t%%zmm4\n\t" // Load the item we will be multiplying by. + "vpbroadcastd\t1(%%r9)%{uint8%},\t%%zmm8\n\t" // Load the item we will be multiplying by. "vprefetch1\t2(%%r9)\n\t" "add\t$2,\t%%r9\n\t" - "vprefetch0\t(%%r11)\n\t" // prepare for a run-through. - "vprefetch0\t64(%%r11)\n\t" // prepare for a run-through. - "vprefetch0\t(%%r8)\n\t" // prepare for a run-through. - "vprefetch0\t(%%r9)\n\t" // prepare for a run-through. - "cmp\t$4,\t%%ecx\n\t" // see if this is our last run-through. - "vpmadd231d\t%%zmm13,\t%%zmm4,\t%%zmm7\n\t" // perform our multiply-add. - "vpmadd231d\t%%zmm15,\t%%zmm4,\t%%zmm7\n\t" // perform our multiply-add. - "vpmadd231d\t%%zmm1,\t%%zmm8,\t%%zmm7\n\t" // perform our multiply-add. - "vpmadd231d\t%%zmm3,\t%%zmm8,\t%%zmm7\n\t" // perform our multiply-add. + "vprefetch0\t(%%r11)\n\t" // Prepare for a run-through. + "vprefetch0\t64(%%r11)\n\t" // Prepare for a run-through. + "vprefetch0\t(%%r8)\n\t" // Prepare for a run-through. + "vprefetch0\t(%%r9)\n\t" // Prepare for a run-through. + "cmp\t$4,\t%%ecx\n\t" // See if this is our last run-through. + "vpmadd231d\t%%zmm13,\t%%zmm4,\t%%zmm7\n\t" // Perform our multiply-add. + "vpmadd231d\t%%zmm15,\t%%zmm4,\t%%zmm7\n\t" // Perform our multiply-add. + "vpmadd231d\t%%zmm1,\t%%zmm8,\t%%zmm7\n\t" // Perform our multiply-add. + "vpmadd231d\t%%zmm3,\t%%zmm8,\t%%zmm7\n\t" // Perform our multiply-add. "jl\t1b\n\t" - "vcvtfxpntdq2ps\t$0,%%zmm7,\t%%zmm9\n\t" // convert our ints to floats. + "vcvtfxpntdq2ps\t$0,%%zmm7,\t%%zmm9\n\t" // Convert our ints to floats. "vfmadd231ps\t%%zmm5,\t%%zmm9,\t%%zmm6\n\t" // Perform a fused multiply add. - "vmovaps\t\t%%zmm6,\t(%[RES])\n\t" // save the result. + "vmovaps\t\t%%zmm6,\t(%[RES])\n\t" // Save the result. : [RES] "+r" (res) : [SRC8] "r" (q8), [OFFSET] "m" (q8offset), @@ -190,54 +190,54 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "vprefetch0\t(%[SRC1])\n\t" // Issue our memory requests first thing. 
"vprefetch0\t(%[SRC4])\n\t" "vprefetchenta\t(%[DST])\n\t" - "mov\t%[SRC4],\t%%r9\n\t" // load the address of the head of our 4-bit list. - "mov\t%[DST],\t%%r8\n\t" // load the address of the head of our destination list. - "mov\t$0,%%ecx\n\t" // initialize our counter. - "vpbroadcastd\t%[MASK]%{uint8%},\t%%zmm0\n\t" // load our mask. - "vpbroadcastd\t%[BIT5]%{uint8},\t%%zmm1\n\t" // load the bit we want to add (conditionally). + "mov\t%[SRC4],\t%%r9\n\t" // Load the address of the head of our 4-bit list. + "mov\t%[DST],\t%%r8\n\t" // Load the address of the head of our destination list. + "mov\t$0,%%ecx\n\t" // Initialize our counter. + "vpbroadcastd\t%[MASK]%{uint8%},\t%%zmm0\n\t" // Load our mask. + "vpbroadcastd\t%[BIT5]%{uint8},\t%%zmm1\n\t" // Load the bit we want to add (conditionally). "vpbroadcastd\t%[M]%{uint8%},\t%%zmm2\n\t" // Select which bit we want to test for. Start with bit 1. - "vmovdqa32\t(%[SRC1])%{uint8%},\t%%zmm3\n\t" // load 16 sets of 8 bit packed single bits. - "vmovdqa32\t16(%[SRC1])%{uint8%},\t%%zmm4\n\t" // load the next 16 sets of 8 bit packed single bits. + "vmovdqa32\t(%[SRC1])%{uint8%},\t%%zmm3\n\t" // Load 16 sets of 8 bit packed single bits. + "vmovdqa32\t16(%[SRC1])%{uint8%},\t%%zmm4\n\t" // Load the next 16 sets of 8 bit packed single bits. "1:\n\t" - "inc\t%%ecx\n\t" // we are in the loop. increment the counter. + "inc\t%%ecx\n\t" // We are in the loop. increment the counter. "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // Test to see if our selected bit is set. "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // Test to see if our selected bit is set. - "vloadunpackld\t\t(%%r9)%{uint8%},\t%%zmm5\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vloadunpackhd\t\t16(%%r9)%{uint8%},\t%%zmm5\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vpandd\t%%zmm0,\t%%zmm5,\t%%zmm6\n\t" // apply a mask, storing the low four bits of vector zmm5 into zmm6. - "vpord\t%%zmm1,%%zmm6,%%zmm6%{%%k1%}\n\t" // turn on bit 5 for all values that passed the prior test. - "vmovdqa32\t\t%%zmm6%{uint8%},\t(%%r8)\n\t" // save our result. + "vloadunpackld\t\t(%%r9)%{uint8%},\t%%zmm5\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vloadunpackhd\t\t16(%%r9)%{uint8%},\t%%zmm5\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vpandd\t%%zmm0,\t%%zmm5,\t%%zmm6\n\t" // Apply a mask, storing the low four bits of vector zmm5 into zmm6. + "vpord\t%%zmm1,%%zmm6,%%zmm6%{%%k1%}\n\t" // Turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm6%{uint8%},\t(%%r8)\n\t" // Save our result. - "vloadunpackld\t\t16(%%r9)%{uint8%},\t%%zmm7\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vloadunpackhd\t\t32(%%r9)%{uint8%},\t%%zmm7\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vprefetch1\t32(%%r9)\n\t" // pull the next set of 4 bit sequences into the L2 cache. - "vpandd\t%%zmm0,\t%%zmm7,\t%%zmm8\n\t" // apply a mask, storing the next low four bits of vector zmm1 into zmm5. - "vpord\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // turn on bit 5 for all values that passed the prior test. - "vmovdqa32\t\t%%zmm8%{uint8%},\t16(%%r8)\n\t" // save our result. + "vloadunpackld\t\t16(%%r9)%{uint8%},\t%%zmm7\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. 
+ "vloadunpackhd\t\t32(%%r9)%{uint8%},\t%%zmm7\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vprefetch1\t32(%%r9)\n\t" // Pull the next set of 4 bit sequences into the L2 cache. + "vpandd\t%%zmm0,\t%%zmm7,\t%%zmm8\n\t" // Apply a mask, storing the next low four bits of vector zmm1 into zmm5. + "vpord\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // Turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm8%{uint8%},\t16(%%r8)\n\t" // Save our result. "add\t$32,\t%%r8\n\t" "cmp\t$4,\t%%ecx\n\t" - "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // select which bit we want to test for. + "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // Select which bit we want to test for. - "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // perform our test. - "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // perform our test. - "vpsrld\t$4,\t%%zmm5,\t%%zmm6\n\t" // load our even 4 bit sequence - "vpsrld\t$4,\t%%zmm7,\t%%zmm8\n\t" // load our even 4 bit sequence - "vpord\t%%zmm1,%%zmm6,%%zmm6%{%%k1%}\n\t" // turn on bit 5 for all values that passed the prior test. - "vpord\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // turn on bit 5 for all values that passed the prior test. - "vmovdqa32\t\t%%zmm6%{uint8%},\t(%%r8)\n\t" // save our result. - "vmovdqa32\t\t%%zmm8%{uint8%},\t16(%%r8)\n\t" // save our result. + "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // Perform our test. + "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // Perform our test. + "vpsrld\t$4,\t%%zmm5,\t%%zmm6\n\t" // Load our even 4 bit sequence + "vpsrld\t$4,\t%%zmm7,\t%%zmm8\n\t" // Load our even 4 bit sequence + "vpord\t%%zmm1,%%zmm6,%%zmm6%{%%k1%}\n\t" // Turn on bit 5 for all values that passed the prior test. + "vpord\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // Turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm6%{uint8%},\t(%%r8)\n\t" // Save our result. + "vmovdqa32\t\t%%zmm8%{uint8%},\t16(%%r8)\n\t" // Save our result. "vprefetchenta\t32(%%r8)\n\t" "je\t2f\n\t" "vprefetch0\t32(%%r9)\n\t" "vprefetch1\t96(%%r9)\n\t" - "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // select which bit we want to test for. + "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // Select which bit we want to test for. "add\t$32,\t%%r9\n\t" "add\t$32,\t%%r8\n\t" "jmp\t1b\n\t" From 047291fb4267bed3b81d9a6110ec49e2ae2d68e4 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Fri, 26 Apr 2024 14:44:08 +0000 Subject: [PATCH 071/105] spacing and capitalization changes. Fix the register list of GGML_5bit_Unpacked_Unaligned. --- ggml-phi-knc-dot_q5_K_q8_K.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 6a8559a08..d20960c5e 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -31,7 +31,7 @@ void GGML_F32x16_VEC_ZERO(float32x16_t *target) "vbroadcastss\t%[Z]%{uint8%},\t%%zmm0\n\t" // use an upscaling operator to clear our register. "vmovaps\t\t%%zmm0,\t%[RES]\n\t" : [RES] "+m" (*target) - : [Z] "m" (zero) + : [Z] "m" (zero) : "zmm0", "memory"); } @@ -104,23 +104,23 @@ void GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned (const int8x16_ "jl\t20f\n\t" "cmp\t$48,%%r10\n\t" "jl\t21f\n\t" - "add\t$64,%%r12\n\t" // Greater than 48. + "add\t$64,%%r12\n\t" // Greater than 47. "jmp\t18f\n\t" "21:\n\t" - "add\t$64,%%r13\n\t" // Between 49 and 32. + "add\t$64,%%r13\n\t" // Between 48 and 31. "jmp\t18f\n\t" "20:\n\t" // Less than 32... "cmp\t$16,%%r10\n\t" "jz\t18f\n\t" // Zero. 
"jl\t23f\n\t" - "add\t$64,%%r14\n\t" // Between 32 and 16. + "add\t$64,%%r14\n\t" // Between 32 and 15. "jmp\t18f\n\t" "23:\n\t" "add\t$64,%%r15\n\t" // Between 16 and zero. "18:\n\t" "vbroadcastss\t%[SCALEY],\t%%zmm3\n\t" // Load the scale factors coresponding to the two input vectors. "vbroadcastss\t%[SCALEX]%{float16%},\t%%zmm4\n\t" - "vmulps\t%%zmm3,\t%%zmm4,\t%%zmm5\n\t" // Brepare the factor we're going to multiply the result by.. + "vmulps\t%%zmm3,\t%%zmm4,\t%%zmm5\n\t" // Prepare the factor we're going to multiply the result by.. "vmovaps\t\t(%[RES]),\t%%zmm6\n\t" // Load our inital state from sum.. "vpbroadcastd\t%[Z]%{uint8%},\t%%zmm7\n\t" // Empty our result. "1:\n\t" @@ -196,8 +196,8 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "vpbroadcastd\t%[MASK]%{uint8%},\t%%zmm0\n\t" // Load our mask. "vpbroadcastd\t%[BIT5]%{uint8},\t%%zmm1\n\t" // Load the bit we want to add (conditionally). "vpbroadcastd\t%[M]%{uint8%},\t%%zmm2\n\t" // Select which bit we want to test for. Start with bit 1. - "vmovdqa32\t(%[SRC1])%{uint8%},\t%%zmm3\n\t" // Load 16 sets of 8 bit packed single bits. - "vmovdqa32\t16(%[SRC1])%{uint8%},\t%%zmm4\n\t" // Load the next 16 sets of 8 bit packed single bits. + "vmovdqa32\t(%[SRC1])%{uint8%},\t%%zmm3\n\t" // Load 16 sets of 8 packed single bits. + "vmovdqa32\t16(%[SRC1])%{uint8%},\t%%zmm4\n\t" // Load the next 16 sets of 8 packed single bits. "1:\n\t" "inc\t%%ecx\n\t" // We are in the loop. increment the counter. @@ -207,21 +207,21 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "vloadunpackld\t\t(%%r9)%{uint8%},\t%%zmm5\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. "vloadunpackhd\t\t16(%%r9)%{uint8%},\t%%zmm5\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vpandd\t%%zmm0,\t%%zmm5,\t%%zmm6\n\t" // Apply a mask, storing the low four bits of vector zmm5 into zmm6. + "vpandd\t%%zmm0,\t%%zmm5,\t%%zmm6\n\t" // Apply a mask, storing the first set of four bits into a vector. "vpord\t%%zmm1,%%zmm6,%%zmm6%{%%k1%}\n\t" // Turn on bit 5 for all values that passed the prior test. "vmovdqa32\t\t%%zmm6%{uint8%},\t(%%r8)\n\t" // Save our result. "vloadunpackld\t\t16(%%r9)%{uint8%},\t%%zmm7\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. "vloadunpackhd\t\t32(%%r9)%{uint8%},\t%%zmm7\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. "vprefetch1\t32(%%r9)\n\t" // Pull the next set of 4 bit sequences into the L2 cache. - "vpandd\t%%zmm0,\t%%zmm7,\t%%zmm8\n\t" // Apply a mask, storing the next low four bits of vector zmm1 into zmm5. + "vpandd\t%%zmm0,\t%%zmm7,\t%%zmm8\n\t" // Apply a mask, storing the next sets of four bits into a vector. "vpord\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // Turn on bit 5 for all values that passed the prior test. "vmovdqa32\t\t%%zmm8%{uint8%},\t16(%%r8)\n\t" // Save our result. "add\t$32,\t%%r8\n\t" "cmp\t$4,\t%%ecx\n\t" - "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // Select which bit we want to test for. + "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // Select the next bit to test for. "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // Perform our test. "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // Perform our test. 
@@ -237,7 +237,7 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "vprefetch0\t32(%%r9)\n\t" "vprefetch1\t96(%%r9)\n\t" - "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // Select which bit we want to test for. + "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // Select the next bit to test for. "add\t$32,\t%%r9\n\t" "add\t$32,\t%%r8\n\t" "jmp\t1b\n\t" @@ -248,19 +248,18 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint [MASK] "m" (lowmask), [M] "m" (m), [BIT5] "m" (bit5) - : "zmm0", "zmm1", "zmm2", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "cc", "ecx", "k1", "k2", "r12", "r8", "memory" - ); + : "zmm0", "zmm1", "zmm2", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "cc", "ecx", "k1", "k2", "r8", "r9", "memory"); } // A function for getting the dot product of two vectors, one of 5 bit resolution, and one of 8. // Used during inference, if your model prints "llama_model_loader: - type q5_K: XXX tensors", and XXX is not zero. :) void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { - /* interpret X and Y as vectors. */ + /* Interpret X and Y as vectors. */ const block_q5_K * restrict x = vx; const block_q8_K * restrict y = vy; - /* the number of blocks we will process this in. */ + /* The number of blocks we will process this in. */ const int nb = n / QK_K; static const uint32_t kmask1 = 0x3f3f3f3f; @@ -274,18 +273,19 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r float32x16_t sums; - // clear sums. + // Clear sums. GGML_F32x16_VEC_ZERO(&sums); float sumf = 0; + for (int i = 0; i < nb; ++i) { uint8x16_t q5 [QK_K/16]; - // combine our 4 and 1 bit vector sets into a 5 bit vector (in 8 bits). + // Combine our 4 and 1 bit vector sets into a 5 bit vector (in 8 bits). GGML_5bit_Unpack_Unaligned((const uint8x16_t *)x[i].qs, x[i].qh, q5); - // extract scales and mins.. + // Extract scales and mins.. memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); const uint32_t uaux = utmp[1] & kmask1; From 81ca166ecd45bb39be5b35c938e4a99d2f0c5bae Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Thu, 9 May 2024 16:57:59 +0000 Subject: [PATCH 072/105] minor spacing and comment changes. --- ggml-phi-knc-dot_q5_K_q8_K.c | 17 ++++++++--------- ggml-phi-knc-dot_q5_K_q8_K.h | 8 ++++++-- ggml-phi-knc.c | 1 + 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index d20960c5e..4999f6ca0 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -16,7 +16,7 @@ // For block_q5_K and block_q8_K. #include "ggml-common.h" -// For our vector types. +// For our vector types, and forward declarations. #include "ggml-phi-knc-dot_q5_K_q8_K.h" // We can fit 16 float32s in a single vector register. @@ -33,7 +33,6 @@ void GGML_F32x16_VEC_ZERO(float32x16_t *target) : [RES] "+m" (*target) : [Z] "m" (zero) : "zmm0", "memory"); - } /* Convert a FP16 to a FP32. */ @@ -76,8 +75,8 @@ ggml_fp16_t GGML_PHI_FP32_TO_FP16(float src) } -// This function perform two multiplies of an I8x16 and an I8x16 vector into two I16x16 vectors. then does an FMA on the scaled result of multiplying the two I16x16 vectors, adding the result into an I32x16. When done, it multiplies this I32x16 by a float, returning a F32x16. -// It loops 8 times. well, actually four, with an unroll. 
+// This function perform two multiplies of an I8x16 and an I8x16 vector into two I16x16 vectors. Then it does an FMA on the scaled result of multiplying the two I16x16 vectors, adding the result into an I32x16. When done, it multiplies this I32x16 by a float, returning a F32x16. +// It loops 8 times. Well, actually four, with an unroll. // Handles q8 being unaligned. // Requires q5 to be aligned. void GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned (const int8x16_t *q8, uint8x16_t *q5, const uint8_t *scale, ggml_fp16_t scaleX, float scaleY, float32x16_t *res) @@ -214,7 +213,7 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "vloadunpackld\t\t16(%%r9)%{uint8%},\t%%zmm7\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. "vloadunpackhd\t\t32(%%r9)%{uint8%},\t%%zmm7\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. "vprefetch1\t32(%%r9)\n\t" // Pull the next set of 4 bit sequences into the L2 cache. - "vpandd\t%%zmm0,\t%%zmm7,\t%%zmm8\n\t" // Apply a mask, storing the next sets of four bits into a vector. + "vpandd\t%%zmm0,\t%%zmm7,\t%%zmm8\n\t" // Apply a mask, storing the next set of four bits into a vector. "vpord\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // Turn on bit 5 for all values that passed the prior test. "vmovdqa32\t\t%%zmm8%{uint8%},\t16(%%r8)\n\t" // Save our result. @@ -225,8 +224,8 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // Perform our test. "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // Perform our test. - "vpsrld\t$4,\t%%zmm5,\t%%zmm6\n\t" // Load our even 4 bit sequence - "vpsrld\t$4,\t%%zmm7,\t%%zmm8\n\t" // Load our even 4 bit sequence + "vpsrld\t$4,\t%%zmm5,\t%%zmm6\n\t" // Load our even 4 bit sequence. + "vpsrld\t$4,\t%%zmm7,\t%%zmm8\n\t" // Load our next even 4 bit sequence. "vpord\t%%zmm1,%%zmm6,%%zmm6%{%%k1%}\n\t" // Turn on bit 5 for all values that passed the prior test. "vpord\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // Turn on bit 5 for all values that passed the prior test. "vmovdqa32\t\t%%zmm6%{uint8%},\t(%%r8)\n\t" // Save our result. @@ -294,10 +293,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r utmp[0] &= kmask1; int sumi = 0; + for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; - - // FIXME: while comparing FMA output to the original output, the original had an error. hunt it down. + // FIXME: while comparing FMA output to the original output, the original had an error. Hunt it down. GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned((const int8x16_t *)y[i].qs, q5, scales, x[i].d, y[i].d, &sums); const float dmin = GGML_PHI_FP16_TO_FP32(x[i].dmin) * y[i].d; diff --git a/ggml-phi-knc-dot_q5_K_q8_K.h b/ggml-phi-knc-dot_q5_K_q8_K.h index bc7fee77f..bd4d814ae 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.h +++ b/ggml-phi-knc-dot_q5_K_q8_K.h @@ -1,4 +1,7 @@ // Formatted with: indent -npcs -nlp -i4 -l300 +/* Formatted by using emacs, with (M-x set-variable RET c-basic-offset RET 4 RET) executed. */ +/* Formatted by using emacs, with (M-x set-variable RET indent-tabs-mode RET nil RET) executed. */ + #pragma once #include "ggml.h" @@ -10,7 +13,8 @@ extern "C" /* A forward declaration, to keep GCC happy. */ void ggml_vec_dot_q5_K_q8_K(int n, float *restrict s, size_t bs, const void *restrict vx, size_t bx, const void *restrict vy, size_t by, int nrc); - // Force an alignment onto these vectors. 
+ + // Define our vector types, with a default alignment. typedef float float32x16_t __attribute__((vector_size (64), aligned(64))); typedef int8_t int8x16_t __attribute__((vector_size (16), aligned(16))); typedef uint8_t uint8x16_t __attribute__((vector_size (16), aligned(16))); @@ -20,7 +24,7 @@ extern "C" void GGML_F32x16_VEC_ZERO(float32x16_t *target); // Convert an FP16 value to FP32(Float). float GGML_PHI_FP16_TO_FP32(ggml_fp16_t src); - // Convert an FP32 value to FP16. + // Convert an FP32(Float) value to FP16. ggml_fp16_t GGML_PHI_FP32_TO_FP16(float src); // Create a 5 bit int vector from a 4 bit vector and a 1 bit vector, both in packed forms. void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint8x16_t * dst); diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index 710da27a6..6cd98e1f5 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -1,5 +1,6 @@ /* Xeon PHI IMCI support. */ /* formatted by using emacs, with (M-x set-variable RET c-basic-offset RET 4 RET) executed. */ +/* Formatted by using emacs, with (M-x set-variable RET indent-tabs-mode RET nil RET) executed. */ #include From af4ee51fa79a00c0cbacf6184e4fadbfd29c6fd7 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Thu, 9 May 2024 19:31:28 +0000 Subject: [PATCH 073/105] add batch fp16<->fp32 conversion functions. --- ggml-phi-knc-dot_q5_K_q8_K.c | 15 +++++++++++++++ ggml-phi-knc-dot_q5_K_q8_K.h | 7 ++++++- ggml.c | 11 +++++++++++ llama.cpp | 9 +++++++++ 4 files changed, 41 insertions(+), 1 deletion(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 4999f6ca0..db82653b4 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -55,6 +55,14 @@ float GGML_PHI_FP16_TO_FP32(ggml_fp16_t src) return f32[0]; } +/* convert many FP16s to FP32s. */ +void GGML_PHI_FP16_TO_FP32_ROW(const ggml_fp16_t * x, float * y, int n) +{ + for (int i = 0; i < n; i++) { + y[i] = GGML_PHI_FP16_TO_FP32(x[i]); + } +} + /* Convert a FP32 to a FP16. */ ggml_fp16_t GGML_PHI_FP32_TO_FP16(float src) { @@ -74,6 +82,13 @@ ggml_fp16_t GGML_PHI_FP32_TO_FP16(float src) return f16[0]; } +/* convert many FP32s to FP16s. */ +void GGML_PHI_FP32_TO_FP16_ROW(const float * x, ggml_fp16_t * y, int n) +{ + for (int i = 0; i < n; i++) { + y[i] = GGML_PHI_FP32_TO_FP16(x[i]); + } +} // This function perform two multiplies of an I8x16 and an I8x16 vector into two I16x16 vectors. Then it does an FMA on the scaled result of multiplying the two I16x16 vectors, adding the result into an I32x16. When done, it multiplies this I32x16 by a float, returning a F32x16. // It loops 8 times. Well, actually four, with an unroll. diff --git a/ggml-phi-knc-dot_q5_K_q8_K.h b/ggml-phi-knc-dot_q5_K_q8_K.h index bd4d814ae..efc629a8a 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.h +++ b/ggml-phi-knc-dot_q5_K_q8_K.h @@ -20,12 +20,17 @@ extern "C" typedef uint8_t uint8x16_t __attribute__((vector_size (16), aligned(16))); typedef int32_t int32x16_t __attribute__((vector_size (64), aligned(64))); - // Zero out a vector of Floats + // Zero out a vector of 16 Floats. void GGML_F32x16_VEC_ZERO(float32x16_t *target); // Convert an FP16 value to FP32(Float). float GGML_PHI_FP16_TO_FP32(ggml_fp16_t src); + // Convert a set of FP16 values to FP32(Float). + void GGML_PHI_FP16_TO_FP32_ROW(const ggml_fp16_t * x, float * y, int n); // Convert an FP32(Float) value to FP16. ggml_fp16_t GGML_PHI_FP32_TO_FP16(float src); + // Convert an FP32(Float) value to FP16. 
+ void GGML_PHI_FP32_TO_FP16_ROW(const float * x, ggml_fp16_t * y, int n); + // Create a 5 bit int vector from a 4 bit vector and a 1 bit vector, both in packed forms. void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint8x16_t * dst); // Multiply a Q5 and Q8 vector against each other, with some scaling. diff --git a/ggml.c b/ggml.c index 7d555c969..55b976af3 100644 --- a/ggml.c +++ b/ggml.c @@ -44,6 +44,7 @@ // hand assembled replacement functions are cool. #if defined(__k1om__) #include +#include #endif #if defined(_WIN32) @@ -338,6 +339,14 @@ const char * ggml_status_to_string(enum ggml_status status) { // note: do not use these inside ggml.c // these are meant to be used via the ggml.h API +#if defined(__k1om__) + +#define ggml_fp16_to_fp32 GGML_PHI_FP16_TO_FP32 +#define ggml_fp32_to_fp16 GGML_PHI_FP32_TO_FP16 +#define ggml_fp16_to_fp32_row GGML_PHI_FP16_TO_FP32_ROW +#define ggml_fp32_to_fp16_row GGML_PHI_FP32_TO_FP16_ROW + +#else float ggml_fp16_to_fp32(ggml_fp16_t x) { return GGML_FP16_TO_FP32(x); } @@ -371,6 +380,8 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) { } } +#endif /* defined(__k1om__) */ + bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b) { return memcmp(guid_a, guid_b, sizeof(ggml_guid)) == 0; } diff --git a/llama.cpp b/llama.cpp index 2b0ee2922..894f3dbc5 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7,6 +7,15 @@ #include "ggml-alloc.h" #include "ggml-backend.h" +// hand assembled replacement functions are cool. +#if defined(__k1om__) +#include "ggml-phi-knc-dot_q5_K_q8_K.h" + +#define ggml_fp16_to_fp32_row GGML_PHI_FP16_TO_FP32_ROW +#define ggml_fp32_to_fp16_row GGML_PHI_FP32_TO_FP16_ROW + +#endif + #ifdef GGML_USE_CUBLAS # include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) From a283551db0103e643984e7e914cf08564d6426b3 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Thu, 9 May 2024 20:40:50 +0000 Subject: [PATCH 074/105] remove a warning. --- bench-phi-knc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench-phi-knc.c b/bench-phi-knc.c index 0f8efc833..ca3acc71d 100644 --- a/bench-phi-knc.c +++ b/bench-phi-knc.c @@ -206,7 +206,7 @@ int main(void) float sumf = 0.0f; for (int l = 0; l < 16; ++l) sumf += ((float *)&resNew1)[l]; - printf("Got a res from a Q8 offset by %d: %f\n", ((int)q8ptr) & 0x3F, sumf); + printf("Got a res from a Q8 offset by %d: %f\n", ((uint64_t) q8ptr) & 0x3F, sumf); } return 0; From e1fdfaae45752d71876cbabd13d7ad4ebbc7859e Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Thu, 9 May 2024 20:41:50 +0000 Subject: [PATCH 075/105] fix typo --- ggml-quants.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-quants.c b/ggml-quants.c index 93e51bb11..ea5fcb890 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -3583,7 +3583,7 @@ void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) { quantize_row_q8_K_reference(x, y, k); } -//===================================== Dot ptoducts ================================= +//===================================== Dot products ================================= // // Helper functions From 867de5edce4a0266362809cb49969b3a03fb839a Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Thu, 9 May 2024 23:08:43 +0000 Subject: [PATCH 076/105] use different restrict syntax, to make g++ happy. 
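
g++ rejects the C99 keyword 'restrict', while the GNU '__restrict' spelling is accepted
by both gcc and g++, so a declaration shared with llama.cpp has to use the latter.
A shared header can also hide the difference behind a macro; the sketch below is for
illustration only (the macro name is made up), and this patch instead just spells
'__restrict' directly:

    #ifdef __cplusplus
    #define EXAMPLE_RESTRICT __restrict
    #else
    #define EXAMPLE_RESTRICT restrict
    #endif

    void scale_row(float * EXAMPLE_RESTRICT dst, const float * EXAMPLE_RESTRICT src, int n);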
--- ggml-phi-knc-dot_q5_K_q8_K.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.h b/ggml-phi-knc-dot_q5_K_q8_K.h index efc629a8a..820cdf95b 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.h +++ b/ggml-phi-knc-dot_q5_K_q8_K.h @@ -12,7 +12,7 @@ extern "C" #endif /* A forward declaration, to keep GCC happy. */ - void ggml_vec_dot_q5_K_q8_K(int n, float *restrict s, size_t bs, const void *restrict vx, size_t bx, const void *restrict vy, size_t by, int nrc); + void ggml_vec_dot_q5_K_q8_K(int n, float * __restrict s, size_t bs, const void * __restrict vx, size_t bx, const void * __restrict vy, size_t by, int nrc); // Define our vector types, with a default alignment. typedef float float32x16_t __attribute__((vector_size (64), aligned(64))); From 2282ac4d9f569ef164fabf2bafc4162fd3f44309 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Fri, 10 May 2024 14:19:27 +0000 Subject: [PATCH 077/105] broadcast a single int8, instead of 4 of them. --- ggml-phi-knc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index 6cd98e1f5..095241cda 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -21,14 +21,14 @@ void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restri inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target) { - uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; + uint8_t zero = 0; __asm__ __volatile__ ( - "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. + "vbroadcastss\t%[Z]%{uint8%},\t%%zmm8\n\t" // use an upscaling operator to clear our value. "vmovnraps\t\t%%zmm8,\t%[RES]\n\t" : [RES] "+m" (*target) - : [Z] "m" (zero) - : "zmm8"); + : [Z] "m" (zero) + : "zmm8", "memory"); } From f6edcc40612d38093ec65ef8f0010cab331c9e35 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Fri, 10 May 2024 14:52:46 +0000 Subject: [PATCH 078/105] Use a vectorized assembly function to handle remaining chunks less than vector wide. --- ggml-phi-knc.c | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index 095241cda..9f9cf1f0d 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -107,6 +107,27 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "cc", "memory", "r8", "r10", "r12"); } +// Multiply each item in mvec1 with the corresponding item in mvec2, adding the result to the corresponding item in sum. uses masks to handle just the last run-through. +inline static void GGML_F32x16_VEC_FMA_TAIL(const float32x16_t *mvec1, const float32x16_t *mvec2, float32x16_t *sumvec, size_t items) +{ + uint32_t mask = (0x00000001 << items)-1; + + __asm__ __volatile__ ( + "vprefetchnta\t(%[VEC1])\n\t" + "vprefetchnta\t(%[VEC2])\n\t" + "vmovaps\t\t(%[RES]),\t%%zmm0\n\t" // Load our inital state from sum.. + "kmov\t%[MASK],%%k1\n\t" // Load a mask that we will use to just operate on part of a vector.. + "vmovaps\t\t(%[VEC1]),\t%%zmm1%{%%k1%}\n\t" // Partially two vectors. + "vmovaps\t\t(%[VEC2]),\t%%zmm2%{%%k1%}\n\t" + "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0%{%%k1%}\n\t" // Perform a fused multiply add + "vmovnraps\t\t%%zmm0,\t(%[RES])%{%%k1%}\n\t" // save our results. 
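    /* Not part of the patch: the mask loaded into k1 above is (1u << items) - 1, one
       bit per live lane, so the masked loads, FMA and store only touch the first
       `items` floats.  A scalar picture of this tail step:

           for (size_t l = 0; l < items; ++l)
               ((float *) sumvec)[l] += ((const float *) mvec1)[l] * ((const float *) mvec2)[l];
    */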
+ : [RES] "+r" (sumvec) + : [VEC1] "r" (mvec1), + [VEC2] "r" (mvec2), + [MASK] "r" (mask) + : "zmm0", "zmm1", "zmm2", "k1", "memory"); +} + // NOTE: x and y inputs must be __attribute__((aligned(64))); void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) { @@ -118,26 +139,11 @@ void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restri GGML_F32x16_VEC_FMA((const float32x16_t *)x, (const float32x16_t *)y, &sum, np/GGML_F32_EPR, 1); - // add the leftovers, that could not be handled by the vector loop. - if ( n - np != 0 ) - { - // our extended last part of x. - float32x16_t v1; - GGML_F32x16_VEC_ZERO(&v1); - // our extended last part of y. - float32x16_t v2; - GGML_F32x16_VEC_ZERO(&v2); - - memcpy(&v1, &x[np], (n - np)*sizeof(float)); - memcpy(&v2, &y[np], (n - np)*sizeof(float)); - - GGML_F32x16_VEC_FMA(&v1, - &v2, - &sum, 1, 0); - } + // add the leftovers, that could not be handled by the whole vector loop. + if ( n - np != 0 ) GGML_F32x16_VEC_FMA_TAIL((const float32x16_t *)&x[np], (const float32x16_t *)&y[np], &sum, n-np); // reduce sum, and store it in s. - for (uint32_t i=0; i Date: Fri, 10 May 2024 15:52:35 +0000 Subject: [PATCH 079/105] use vbroadcastss in place of vbroadcast32x4. --- ggml-phi-knc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index 9f9cf1f0d..a273c9525 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -1,5 +1,5 @@ /* Xeon PHI IMCI support. */ -/* formatted by using emacs, with (M-x set-variable RET c-basic-offset RET 4 RET) executed. */ +/* Formatted by using emacs, with (M-x set-variable RET c-basic-offset RET 4 RET) executed. */ /* Formatted by using emacs, with (M-x set-variable RET indent-tabs-mode RET nil RET) executed. */ #include @@ -35,7 +35,7 @@ inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target) // Multiply each item in mvec1 with the corresponding item in mvec2, adding the result to the corresponding item in sum. optionally clear the sum before starting. inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x16_t *mvec2, float32x16_t *sumvec, size_t iterations, int clear) { - uint8_t zero[4] __attribute__((aligned(64))) = {0,0,0,0}; + uint8_t zero = 0; __asm__ __volatile__ ( "mov\t%[ITER],%%r8\n\t" // how many register sized chunks are we responsible for @@ -43,7 +43,7 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x "mov\t%[VEC2],%%r12\n\t" // where do we start work in mvec2? "cmp\t$1,%[CLR]\n\t" // should we clear the sum before we start? "jne\t4f\n\t" - "vbroadcastf32x4\t%[Z]%{uint8%},\t%%zmm0\n\t" // if so, use an upscaling operator to do it. + "vbroadcastss\t%[Z]%{uint8%},\t%%zmm0\n\t" // if so, use an upscaling operator to do it. "vprefetchnta\t(%%r10)\n\t" "vprefetchnta\t(%%r12)\n\t" "vprefetch1\t128(%%r10)\n\t" From 0ff7d5dd1aa0b8e019db7023b644e9e4b9262faf Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Fri, 10 May 2024 16:14:28 +0000 Subject: [PATCH 080/105] perform better prefetches, and invert the test of our clear flag for clarity. 
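
For reference, the kernel being tuned here computes, in plain C terms, roughly the
following (an illustrative sketch only; the helper name is made up):

    static void f32x16_vec_fma_ref(const float *v1, const float *v2, float *sum,
                                   size_t iterations, int clear) {
        if (clear)
            for (int i = 0; i < 16; ++i) sum[i] = 0.0f;
        for (size_t n = 0; n < iterations; ++n)
            for (int i = 0; i < 16; ++i)
                sum[i] += v1[16*n + i] * v2[16*n + i];
    }

This change only reorders the prefetches and flips the sense of the 'clear' test; the
arithmetic is unchanged.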
--- ggml-phi-knc.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index a273c9525..9d7a34199 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -38,11 +38,20 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x uint8_t zero = 0; __asm__ __volatile__ ( - "mov\t%[ITER],%%r8\n\t" // how many register sized chunks are we responsible for - "mov\t%[VEC1],%%r10\n\t" // where do we start work in mvec1? - "mov\t%[VEC2],%%r12\n\t" // where do we start work in mvec2? - "cmp\t$1,%[CLR]\n\t" // should we clear the sum before we start? - "jne\t4f\n\t" + "vprefetchenta\t(%[RES])\n\t" + "vprefetch0\t(%[VEC1])\n\t" + "vprefetch1\t64(%[VEC1])\n\t" + "vprefetch0\t128(%[VEC1])\n\t" + "vprefetch1\t192(%[VEC1])\n\t" + "vprefetch0\t(%[VEC2])\n\t" + "vprefetch1\t64(%[VEC2])\n\t" + "vprefetch0\t128(%[VEC2])\n\t" + "vprefetch1\t192(%[VEC2])\n\t" + "mov\t%[ITER],%%r8\n\t" // How many vector sized chunks we are responsible for. + "mov\t%[VEC1],%%r10\n\t" // Where do we start work in mvec1? + "mov\t%[VEC2],%%r12\n\t" // Where do we start work in mvec2? + "cmp\t$0,%[CLR]\n\t" // Should we clear the sum before we start? + "jz\t4f\n\t" "vbroadcastss\t%[Z]%{uint8%},\t%%zmm0\n\t" // if so, use an upscaling operator to do it. "vprefetchnta\t(%%r10)\n\t" "vprefetchnta\t(%%r12)\n\t" From 650094e17b040f5fe73ff6ee03645d51a663a84a Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Fri, 10 May 2024 16:28:53 +0000 Subject: [PATCH 081/105] remove useless prefetches. --- ggml-phi-knc.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index 9d7a34199..00bc860f1 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -47,22 +47,12 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x "vprefetch1\t64(%[VEC2])\n\t" "vprefetch0\t128(%[VEC2])\n\t" "vprefetch1\t192(%[VEC2])\n\t" - "mov\t%[ITER],%%r8\n\t" // How many vector sized chunks we are responsible for. + "mov\t%[ITER],%%r8\n\t" // How many vector sized chunks are we responsible for? "mov\t%[VEC1],%%r10\n\t" // Where do we start work in mvec1? "mov\t%[VEC2],%%r12\n\t" // Where do we start work in mvec2? "cmp\t$0,%[CLR]\n\t" // Should we clear the sum before we start? "jz\t4f\n\t" "vbroadcastss\t%[Z]%{uint8%},\t%%zmm0\n\t" // if so, use an upscaling operator to do it. - "vprefetchnta\t(%%r10)\n\t" - "vprefetchnta\t(%%r12)\n\t" - "vprefetch1\t128(%%r10)\n\t" - "vprefetch1\t128(%%r12)\n\t" - "vprefetch1\t256(%%r10)\n\t" - "vprefetch1\t256(%%r12)\n\t" - "vprefetch1\t384(%%r10)\n\t" - "vprefetch1\t384(%%r12)\n\t" - "vprefetch1\t512(%%r10)\n\t" - "vprefetch1\t512(%%r12)\n\t" "jmp\t1f\n\t" "4:\n\t" "vprefetch0\t(%[RES])\n\t" From 7966c8e443280e67fb01edf08c1e8aceb3cdd1c2 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Fri, 10 May 2024 16:50:39 +0000 Subject: [PATCH 082/105] spacing and comment changes. --- ggml-phi-knc.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index 00bc860f1..91289bd04 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -29,7 +29,6 @@ inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target) : [RES] "+m" (*target) : [Z] "m" (zero) : "zmm8", "memory"); - } // Multiply each item in mvec1 with the corresponding item in mvec2, adding the result to the corresponding item in sum. optionally clear the sum before starting. 
@@ -52,11 +51,11 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x "mov\t%[VEC2],%%r12\n\t" // Where do we start work in mvec2? "cmp\t$0,%[CLR]\n\t" // Should we clear the sum before we start? "jz\t4f\n\t" - "vbroadcastss\t%[Z]%{uint8%},\t%%zmm0\n\t" // if so, use an upscaling operator to do it. + "vbroadcastss\t%[Z]%{uint8%},\t%%zmm0\n\t" // If so, use an upscaling operator to clear our sum. "jmp\t1f\n\t" "4:\n\t" "vprefetch0\t(%[RES])\n\t" - "vmovaps\t\t(%[RES]),\t%%zmm0\n\t" // otherwise, load our inital state from sum.. + "vmovaps\t\t(%[RES]),\t%%zmm0\n\t" // Otherwise, load our inital state from sum.. "vprefetchnta\t(%%r10)\n\t" "vprefetchnta\t(%%r12)\n\t" "1:\n\t" @@ -83,7 +82,7 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add "vfmadd231ps\t%%zmm5,\t%%zmm6,\t%%zmm0\n\t" // Perform a fused multiply add "jmp\t1b\n\t" // Jump back to the start of the loop - "6:\n\t" // we know we are near the tail. handle 2, 1, and 0 cases. + "6:\n\t" // We know we are near the tail. handle 2, 1, and 0 cases. "cmp\t$0,\t%%r8\n\t" // Compare iterations to zero "je\t2f\n\t" // Jump to label 2 if zero (end of loop) "cmp\t$1,\t%%r8\n\t" // Compare iterations to one @@ -96,7 +95,7 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x "vmovaps\t\t64(%%r12),\t%%zmm4\n\t" "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add "2:\n\t" // Label for loop end - "vmovnraps\t\t%%zmm0,\t(%[RES])\n\t" // save our results. + "vmovnraps\t\t%%zmm0,\t(%[RES])\n\t" // Save our results. : [RES] "+r" (sumvec) : [ITER] "r" (iterations), [VEC1] "r" (mvec1), From 7e44eabe0f6ef666d581cb8041a7f825da7bc34a Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Fri, 10 May 2024 17:03:41 +0000 Subject: [PATCH 083/105] move sub earlier, and move the compare of iterations to outside, and at the end of the loop. --- ggml-phi-knc.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index 91289bd04..5e400849a 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -52,18 +52,19 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x "cmp\t$0,%[CLR]\n\t" // Should we clear the sum before we start? "jz\t4f\n\t" "vbroadcastss\t%[Z]%{uint8%},\t%%zmm0\n\t" // If so, use an upscaling operator to clear our sum. - "jmp\t1f\n\t" + "jmp\t5f\n\t" "4:\n\t" "vprefetch0\t(%[RES])\n\t" "vmovaps\t\t(%[RES]),\t%%zmm0\n\t" // Otherwise, load our inital state from sum.. "vprefetchnta\t(%%r10)\n\t" "vprefetchnta\t(%%r12)\n\t" - "1:\n\t" + "5:\n\t" "cmp\t$3,\t%%r8\n\t" // Compare iterations to three. "jnae\t6f\n\t" // If there are not three iterations left, jump to label 6. + "1:\n\t" + "sub\t$3,\t%%r8\n\t" // Decrement iterations "vmovaps\t\t(%%r10),\t%%zmm1\n\t" // Load two vectors. "vmovaps\t\t(%%r12),\t%%zmm2\n\t" - "sub\t$3,\t%%r8\n\t" // Decrement iterations "vprefetchnta\t192(%%r10)\n\t" // prefetch the next float32x16_t block (192 bytes ahead) "vprefetchnta\t192(%%r12)\n\t" "vmovaps\t\t64(%%r10),\t%%zmm3\n\t" // Load two vectors. 
@@ -81,7 +82,8 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add "vfmadd231ps\t%%zmm5,\t%%zmm6,\t%%zmm0\n\t" // Perform a fused multiply add - "jmp\t1b\n\t" // Jump back to the start of the loop + "cmp\t$3,\t%%r8\n\t" // Compare iterations to three. + "jnae\t6f\n\t" // If there are not three iterations left, jump to label 6. "6:\n\t" // We know we are near the tail. handle 2, 1, and 0 cases. "cmp\t$0,\t%%r8\n\t" // Compare iterations to zero "je\t2f\n\t" // Jump to label 2 if zero (end of loop) From 21a1e740c2049d0f258c84db7ce871ae0596d5e1 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Fri, 10 May 2024 17:07:27 +0000 Subject: [PATCH 084/105] fix loop. --- ggml-phi-knc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index 5e400849a..21a5fa5d4 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -83,7 +83,7 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add "vfmadd231ps\t%%zmm5,\t%%zmm6,\t%%zmm0\n\t" // Perform a fused multiply add "cmp\t$3,\t%%r8\n\t" // Compare iterations to three. - "jnae\t6f\n\t" // If there are not three iterations left, jump to label 6. + "jge\t1b\n\t" // If there still three or more iterations left, loop. "6:\n\t" // We know we are near the tail. handle 2, 1, and 0 cases. "cmp\t$0,\t%%r8\n\t" // Compare iterations to zero "je\t2f\n\t" // Jump to label 2 if zero (end of loop) From 806472787d4b9b6e1ebcd5e7bfe7cd801270f1d6 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Fri, 10 May 2024 19:33:58 +0000 Subject: [PATCH 085/105] use values inside of the loop as soon as we have them. --- ggml-phi-knc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index 21a5fa5d4..89e1ef815 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -67,10 +67,12 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x "vmovaps\t\t(%%r12),\t%%zmm2\n\t" "vprefetchnta\t192(%%r10)\n\t" // prefetch the next float32x16_t block (192 bytes ahead) "vprefetchnta\t192(%%r12)\n\t" + "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add "vmovaps\t\t64(%%r10),\t%%zmm3\n\t" // Load two vectors. "vmovaps\t\t64(%%r12),\t%%zmm4\n\t" "vprefetch1\t320(%%r10)\n\t" // prefetch the block after the block after the next float32x16_t block (320 bytes ahead) "vprefetch1\t320(%%r12)\n\t" + "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add "vmovaps\t\t128(%%r10),\t%%zmm5\n\t" // Load two vectors. "vmovaps\t\t128(%%r12),\t%%zmm6\n\t" "vprefetch1\t576(%%r10)\n\t" @@ -79,8 +81,6 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x "vprefetch1\t704(%%r12)\n\t" "add\t$192,\t%%r10\n\t" // Move to the next float32x16_t block (192 bytes ahead) "add\t$192,\t%%r12\n\t" - "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add - "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add "vfmadd231ps\t%%zmm5,\t%%zmm6,\t%%zmm0\n\t" // Perform a fused multiply add "cmp\t$3,\t%%r8\n\t" // Compare iterations to three. "jge\t1b\n\t" // If there still three or more iterations left, loop. 
From 4a3c42c82cee12a8393e0879c6a3c4cd6f283665 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Fri, 10 May 2024 20:30:56 +0000 Subject: [PATCH 086/105] correct a comment, and use jz when comparing to zero. --- ggml-phi-knc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index 89e1ef815..6670ee8fb 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -86,12 +86,12 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x "jge\t1b\n\t" // If there still three or more iterations left, loop. "6:\n\t" // We know we are near the tail. handle 2, 1, and 0 cases. "cmp\t$0,\t%%r8\n\t" // Compare iterations to zero - "je\t2f\n\t" // Jump to label 2 if zero (end of loop) + "jz\t2f\n\t" // Jump to label 2 if zero (end of loop) "cmp\t$1,\t%%r8\n\t" // Compare iterations to one "vmovaps\t\t(%%r10),\t%%zmm1\n\t" // Load two vectors. "vmovaps\t\t(%%r12),\t%%zmm2\n\t" "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add - "je\t2f\n\t" // Jump to label 3 if one (end of loop) + "je\t2f\n\t" // Jump to label 2 if one (end of loop) // No compare. we must be two. "vmovaps\t\t64(%%r10),\t%%zmm3\n\t" // Load two vectors. "vmovaps\t\t64(%%r12),\t%%zmm4\n\t" From a82ada7dcda8fcee751352a0db1fd386db4bc5d2 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Fri, 10 May 2024 21:57:16 +0000 Subject: [PATCH 087/105] comment clarification. --- ggml-phi-knc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index 6670ee8fb..5f9eb70c4 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -82,8 +82,8 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x "add\t$192,\t%%r10\n\t" // Move to the next float32x16_t block (192 bytes ahead) "add\t$192,\t%%r12\n\t" "vfmadd231ps\t%%zmm5,\t%%zmm6,\t%%zmm0\n\t" // Perform a fused multiply add - "cmp\t$3,\t%%r8\n\t" // Compare iterations to three. - "jge\t1b\n\t" // If there still three or more iterations left, loop. + "cmp\t$3,\t%%r8\n\t" // Compare iteration count to three. + "jge\t1b\n\t" // If there three or more iterations left, loop. "6:\n\t" // We know we are near the tail. handle 2, 1, and 0 cases. "cmp\t$0,\t%%r8\n\t" // Compare iterations to zero "jz\t2f\n\t" // Jump to label 2 if zero (end of loop) From 3156e639bf87ff8c33009bb8bb1b373ca4a7ee43 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 11 May 2024 11:07:16 +0000 Subject: [PATCH 088/105] change from handling three iterations per loop to four. --- ggml-phi-knc.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index 5f9eb70c4..9b53d876d 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -31,7 +31,7 @@ inline static void GGML_F32x16_VEC_ZERO(float32x16_t *target) : "zmm8", "memory"); } -// Multiply each item in mvec1 with the corresponding item in mvec2, adding the result to the corresponding item in sum. optionally clear the sum before starting. +// Multiply each item in mvec1 with the corresponding item in mvec2, adding the result to the corresponding item in sum. Optionally clear the sum before starting. 
inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x16_t *mvec2, float32x16_t *sumvec, size_t iterations, int clear) { uint8_t zero = 0; @@ -59,8 +59,8 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x "vprefetchnta\t(%%r10)\n\t" "vprefetchnta\t(%%r12)\n\t" "5:\n\t" - "cmp\t$3,\t%%r8\n\t" // Compare iterations to three. - "jnae\t6f\n\t" // If there are not three iterations left, jump to label 6. + "cmp\t$4,\t%%r8\n\t" // Compare iterations to four. + "jnae\t6f\n\t" // If there are not four iterations left, jump to label 6. "1:\n\t" "sub\t$3,\t%%r8\n\t" // Decrement iterations "vmovaps\t\t(%%r10),\t%%zmm1\n\t" // Load two vectors. @@ -79,12 +79,15 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x "vprefetch1\t576(%%r12)\n\t" "vprefetch1\t704(%%r10)\n\t" "vprefetch1\t704(%%r12)\n\t" - "add\t$192,\t%%r10\n\t" // Move to the next float32x16_t block (192 bytes ahead) - "add\t$192,\t%%r12\n\t" "vfmadd231ps\t%%zmm5,\t%%zmm6,\t%%zmm0\n\t" // Perform a fused multiply add - "cmp\t$3,\t%%r8\n\t" // Compare iteration count to three. - "jge\t1b\n\t" // If there three or more iterations left, loop. - "6:\n\t" // We know we are near the tail. handle 2, 1, and 0 cases. + "vmovaps\t\t192(%%r10),\t%%zmm7\n\t" // Load two vectors. + "vmovaps\t\t192(%%r12),\t%%zmm8\n\t" + "vfmadd231ps\t%%zmm7,\t%%zmm8,\t%%zmm0\n\t" // Perform a fused multiply add + "add\t$256,\t%%r10\n\t" // Move to the next 4xfloat32x16_t block (256 bytes ahead) + "add\t$256,\t%%r12\n\t" + "cmp\t$4,\t%%r8\n\t" // Compare iteration count to four. + "jge\t1b\n\t" // If there are four or more iterations left, loop. + "6:\n\t" // We know we are near the tail. handle 3, 2, 1, and 0 cases. "cmp\t$0,\t%%r8\n\t" // Compare iterations to zero "jz\t2f\n\t" // Jump to label 2 if zero (end of loop) "cmp\t$1,\t%%r8\n\t" // Compare iterations to one @@ -92,10 +95,14 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x "vmovaps\t\t(%%r12),\t%%zmm2\n\t" "vfmadd231ps\t%%zmm1,\t%%zmm2,\t%%zmm0\n\t" // Perform a fused multiply add "je\t2f\n\t" // Jump to label 2 if one (end of loop) - // No compare. we must be two. + "cmp\t$2,\t%%r8\n\t" // Compare iterations to two "vmovaps\t\t64(%%r10),\t%%zmm3\n\t" // Load two vectors. "vmovaps\t\t64(%%r12),\t%%zmm4\n\t" "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add + // No compare. we must be three. + "vmovaps\t\t64(%%r10),\t%%zmm5\n\t" // Load two vectors. + "vmovaps\t\t64(%%r12),\t%%zmm6\n\t" + "vfmadd231ps\t%%zmm5,\t%%zmm6,\t%%zmm0\n\t" // Perform a fused multiply add "2:\n\t" // Label for loop end "vmovnraps\t\t%%zmm0,\t(%[RES])\n\t" // Save our results. : [RES] "+r" (sumvec) @@ -104,7 +111,7 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x [VEC2] "r" (mvec2), [CLR] "r" (clear), [Z] "m" (zero) - : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "cc", "memory", "r8", "r10", "r12"); + : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "cc", "memory", "r8", "r10", "r12"); } // Multiply each item in mvec1 with the corresponding item in mvec2, adding the result to the corresponding item in sum. uses masks to handle just the last run-through. From fba57c125cdbcf244119b470cf8642f1b196a44c Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 11 May 2024 11:11:15 +0000 Subject: [PATCH 089/105] subtract the correct amount. 
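
The unrolled loop from the previous patch now retires four float32x16_t blocks
(256 bytes of each input) per pass, so the iteration counter has to drop by
four rather than three. In rough C terms the intent is (an illustrative sketch
only, not the generated code):

    while (iterations >= 4) {
        iterations -= 4;   /* this patch: the decrement was still "sub $3" */
        /* four groups of two vmovaps loads plus a vfmadd231ps;
           both input pointers advance 256 bytes */
    }
    /* fall through to the 3/2/1/0-block tail */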
--- ggml-phi-knc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index 9b53d876d..e9b5352b3 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -62,7 +62,7 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x "cmp\t$4,\t%%r8\n\t" // Compare iterations to four. "jnae\t6f\n\t" // If there are not four iterations left, jump to label 6. "1:\n\t" - "sub\t$3,\t%%r8\n\t" // Decrement iterations + "sub\t$4,\t%%r8\n\t" // Decrement iterations "vmovaps\t\t(%%r10),\t%%zmm1\n\t" // Load two vectors. "vmovaps\t\t(%%r12),\t%%zmm2\n\t" "vprefetchnta\t192(%%r10)\n\t" // prefetch the next float32x16_t block (192 bytes ahead) From fa0226c8df70da4030cdb57ab84faaf7974092e1 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 11 May 2024 11:27:52 +0000 Subject: [PATCH 090/105] look at the right final memory location. --- ggml-phi-knc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index e9b5352b3..add8be2da 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -100,8 +100,8 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x "vmovaps\t\t64(%%r12),\t%%zmm4\n\t" "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add // No compare. we must be three. - "vmovaps\t\t64(%%r10),\t%%zmm5\n\t" // Load two vectors. - "vmovaps\t\t64(%%r12),\t%%zmm6\n\t" + "vmovaps\t\t128(%%r10),\t%%zmm5\n\t" // Load two vectors. + "vmovaps\t\t128(%%r12),\t%%zmm6\n\t" "vfmadd231ps\t%%zmm5,\t%%zmm6,\t%%zmm0\n\t" // Perform a fused multiply add "2:\n\t" // Label for loop end "vmovnraps\t\t%%zmm0,\t(%[RES])\n\t" // Save our results. From b34575b1f323b70862a7f121ee390b4566b8801e Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 11 May 2024 12:53:23 +0000 Subject: [PATCH 091/105] add missing jump. --- ggml-phi-knc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml-phi-knc.c b/ggml-phi-knc.c index add8be2da..c4cc49724 100644 --- a/ggml-phi-knc.c +++ b/ggml-phi-knc.c @@ -99,8 +99,9 @@ inline static void GGML_F32x16_VEC_FMA(const float32x16_t *mvec1, const float32x "vmovaps\t\t64(%%r10),\t%%zmm3\n\t" // Load two vectors. "vmovaps\t\t64(%%r12),\t%%zmm4\n\t" "vfmadd231ps\t%%zmm3,\t%%zmm4,\t%%zmm0\n\t" // Perform a fused multiply add + "je\t2f\n\t" // Jump to label 2 if two (end of loop) // No compare. we must be three. - "vmovaps\t\t128(%%r10),\t%%zmm5\n\t" // Load two vectors. + "vmovaps\t\t128(%%r10),\t%%zmm5\n\t" // Load two vectors. "vmovaps\t\t128(%%r12),\t%%zmm6\n\t" "vfmadd231ps\t%%zmm5,\t%%zmm6,\t%%zmm0\n\t" // Perform a fused multiply add "2:\n\t" // Label for loop end From 6c4e687b85e91240a07d36b4d1fb2701777bfcb0 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 11 May 2024 13:26:00 +0000 Subject: [PATCH 092/105] spacing changes. --- ggml-phi-knc-dot_q5_K_q8_K.c | 78 ++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index db82653b4..9c1414e43 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -201,67 +201,67 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint uint8_t bit5 = 0x10; __asm__ __volatile__ ( - "vprefetch0\t(%[SRC1])\n\t" // Issue our memory requests first thing. + "vprefetch0\t(%[SRC1])\n\t" // Issue our memory requests first thing. 
"vprefetch0\t(%[SRC4])\n\t" "vprefetchenta\t(%[DST])\n\t" - "mov\t%[SRC4],\t%%r9\n\t" // Load the address of the head of our 4-bit list. - "mov\t%[DST],\t%%r8\n\t" // Load the address of the head of our destination list. - "mov\t$0,%%ecx\n\t" // Initialize our counter. - "vpbroadcastd\t%[MASK]%{uint8%},\t%%zmm0\n\t" // Load our mask. - "vpbroadcastd\t%[BIT5]%{uint8},\t%%zmm1\n\t" // Load the bit we want to add (conditionally). - "vpbroadcastd\t%[M]%{uint8%},\t%%zmm2\n\t" // Select which bit we want to test for. Start with bit 1. - "vmovdqa32\t(%[SRC1])%{uint8%},\t%%zmm3\n\t" // Load 16 sets of 8 packed single bits. - "vmovdqa32\t16(%[SRC1])%{uint8%},\t%%zmm4\n\t" // Load the next 16 sets of 8 packed single bits. + "mov\t%[SRC4],\t%%r9\n\t" // Load the address of the head of our 4-bit list. + "mov\t%[DST],\t%%r8\n\t" // Load the address of the head of our destination list. + "mov\t$0,%%ecx\n\t" // Initialize our counter. + "vpbroadcastd\t%[MASK]%{uint8%},\t%%zmm0\n\t" // Load our mask. + "vpbroadcastd\t%[BIT5]%{uint8},\t%%zmm1\n\t" // Load the bit we want to add (conditionally). + "vpbroadcastd\t%[M]%{uint8%},\t%%zmm2\n\t" // Select which bit we want to test for. Start with bit 1. + "vmovdqa32\t(%[SRC1])%{uint8%},\t%%zmm3\n\t" // Load 16 sets of 8 packed single bits. + "vmovdqa32\t16(%[SRC1])%{uint8%},\t%%zmm4\n\t" // Load the next 16 sets of 8 packed single bits. "1:\n\t" - "inc\t%%ecx\n\t" // We are in the loop. increment the counter. + "inc\t%%ecx\n\t" // We are in the loop. increment the counter. - "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // Test to see if our selected bit is set. - "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // Test to see if our selected bit is set. + "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // Test to see if our selected bit is set. + "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // Test to see if our selected bit is set. - "vloadunpackld\t\t(%%r9)%{uint8%},\t%%zmm5\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vloadunpackhd\t\t16(%%r9)%{uint8%},\t%%zmm5\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vpandd\t%%zmm0,\t%%zmm5,\t%%zmm6\n\t" // Apply a mask, storing the first set of four bits into a vector. - "vpord\t%%zmm1,%%zmm6,%%zmm6%{%%k1%}\n\t" // Turn on bit 5 for all values that passed the prior test. - "vmovdqa32\t\t%%zmm6%{uint8%},\t(%%r8)\n\t" // Save our result. + "vloadunpackld\t\t(%%r9)%{uint8%},\t%%zmm5\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vloadunpackhd\t\t16(%%r9)%{uint8%},\t%%zmm5\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vpandd\t%%zmm0,\t%%zmm5,\t%%zmm6\n\t" // Apply a mask, storing the first set of four bits into a vector. + "vpord\t%%zmm1,%%zmm6,%%zmm6%{%%k1%}\n\t" // Turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm6%{uint8%},\t(%%r8)\n\t" // Save our result. - "vloadunpackld\t\t16(%%r9)%{uint8%},\t%%zmm7\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vloadunpackhd\t\t32(%%r9)%{uint8%},\t%%zmm7\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vprefetch1\t32(%%r9)\n\t" // Pull the next set of 4 bit sequences into the L2 cache. - "vpandd\t%%zmm0,\t%%zmm7,\t%%zmm8\n\t" // Apply a mask, storing the next set of four bits into a vector. 
- "vpord\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // Turn on bit 5 for all values that passed the prior test. - "vmovdqa32\t\t%%zmm8%{uint8%},\t16(%%r8)\n\t" // Save our result. + "vloadunpackld\t\t16(%%r9)%{uint8%},\t%%zmm7\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vloadunpackhd\t\t32(%%r9)%{uint8%},\t%%zmm7\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vprefetch1\t32(%%r9)\n\t" // Pull the next set of 4 bit sequences into the L2 cache. + "vpandd\t%%zmm0,\t%%zmm7,\t%%zmm8\n\t" // Apply a mask, storing the next set of four bits into a vector. + "vpord\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // Turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm8%{uint8%},\t16(%%r8)\n\t" // Save our result. "add\t$32,\t%%r8\n\t" "cmp\t$4,\t%%ecx\n\t" - "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // Select the next bit to test for. + "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // Select the next bit to test for. - "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // Perform our test. - "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // Perform our test. - "vpsrld\t$4,\t%%zmm5,\t%%zmm6\n\t" // Load our even 4 bit sequence. - "vpsrld\t$4,\t%%zmm7,\t%%zmm8\n\t" // Load our next even 4 bit sequence. - "vpord\t%%zmm1,%%zmm6,%%zmm6%{%%k1%}\n\t" // Turn on bit 5 for all values that passed the prior test. - "vpord\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // Turn on bit 5 for all values that passed the prior test. - "vmovdqa32\t\t%%zmm6%{uint8%},\t(%%r8)\n\t" // Save our result. - "vmovdqa32\t\t%%zmm8%{uint8%},\t16(%%r8)\n\t" // Save our result. + "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // Perform our test. + "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // Perform our test. + "vpsrld\t$4,\t%%zmm5,\t%%zmm6\n\t" // Load our even 4 bit sequence. + "vpsrld\t$4,\t%%zmm7,\t%%zmm8\n\t" // Load our next even 4 bit sequence. + "vpord\t%%zmm1,%%zmm6,%%zmm6%{%%k1%}\n\t" // Turn on bit 5 for all values that passed the prior test. + "vpord\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // Turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm6%{uint8%},\t(%%r8)\n\t" // Save our result. + "vmovdqa32\t\t%%zmm8%{uint8%},\t16(%%r8)\n\t" // Save our result. "vprefetchenta\t32(%%r8)\n\t" "je\t2f\n\t" "vprefetch0\t32(%%r9)\n\t" "vprefetch1\t96(%%r9)\n\t" - "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // Select the next bit to test for. + "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // Select the next bit to test for. "add\t$32,\t%%r9\n\t" "add\t$32,\t%%r8\n\t" "jmp\t1b\n\t" "2:" - : [DST] "+r" (dst) - : [SRC4] "r" (q4), - [SRC1] "r" (q1), - [MASK] "m" (lowmask), - [M] "m" (m), - [BIT5] "m" (bit5) + : [DST] "+r" (dst) + : [SRC4] "r" (q4), + [SRC1] "r" (q1), + [MASK] "m" (lowmask), + [M] "m" (m), + [BIT5] "m" (bit5) : "zmm0", "zmm1", "zmm2", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "cc", "ecx", "k1", "k2", "r8", "r9", "memory"); } From 9d7f967e888581a7509d3630c5906d88d0bf5351 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 11 May 2024 13:35:50 +0000 Subject: [PATCH 093/105] spacing changes. 
--- ggml-phi-knc-dot_q5_K_q8_K.c | 66 ++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 9c1414e43..a6072f665 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -201,57 +201,57 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint uint8_t bit5 = 0x10; __asm__ __volatile__ ( - "vprefetch0\t(%[SRC1])\n\t" // Issue our memory requests first thing. + "vprefetch0\t(%[SRC1])\n\t" // Issue our memory requests first thing. "vprefetch0\t(%[SRC4])\n\t" "vprefetchenta\t(%[DST])\n\t" - "mov\t%[SRC4],\t%%r9\n\t" // Load the address of the head of our 4-bit list. - "mov\t%[DST],\t%%r8\n\t" // Load the address of the head of our destination list. - "mov\t$0,%%ecx\n\t" // Initialize our counter. - "vpbroadcastd\t%[MASK]%{uint8%},\t%%zmm0\n\t" // Load our mask. - "vpbroadcastd\t%[BIT5]%{uint8},\t%%zmm1\n\t" // Load the bit we want to add (conditionally). - "vpbroadcastd\t%[M]%{uint8%},\t%%zmm2\n\t" // Select which bit we want to test for. Start with bit 1. - "vmovdqa32\t(%[SRC1])%{uint8%},\t%%zmm3\n\t" // Load 16 sets of 8 packed single bits. - "vmovdqa32\t16(%[SRC1])%{uint8%},\t%%zmm4\n\t" // Load the next 16 sets of 8 packed single bits. + "mov\t%[SRC4],\t%%r9\n\t" // Load the address of the head of our 4-bit list. + "mov\t%[DST],\t%%r8\n\t" // Load the address of the head of our destination list. + "mov\t$0,%%ecx\n\t" // Initialize our counter. + "vpbroadcastd\t%[MASK]%{uint8%},\t%%zmm0\n\t" // Load our mask. + "vpbroadcastd\t%[BIT5]%{uint8},\t%%zmm1\n\t" // Load the bit we want to add (conditionally). + "vpbroadcastd\t%[M]%{uint8%},\t%%zmm2\n\t" // Select which bit we want to test for. Start with bit 1. + "vmovdqa32\t(%[SRC1])%{uint8%},\t%%zmm3\n\t" // Load 16 sets of 8 packed single bits. + "vmovdqa32\t16(%[SRC1])%{uint8%},\t%%zmm4\n\t" // Load the next 16 sets of 8 packed single bits. "1:\n\t" - "inc\t%%ecx\n\t" // We are in the loop. increment the counter. + "inc\t%%ecx\n\t" // We are in the loop. increment the counter. - "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // Test to see if our selected bit is set. - "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // Test to see if our selected bit is set. + "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // Test to see if our selected bit is set. + "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // Test to see if our selected bit is set. - "vloadunpackld\t\t(%%r9)%{uint8%},\t%%zmm5\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vloadunpackhd\t\t16(%%r9)%{uint8%},\t%%zmm5\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vpandd\t%%zmm0,\t%%zmm5,\t%%zmm6\n\t" // Apply a mask, storing the first set of four bits into a vector. - "vpord\t%%zmm1,%%zmm6,%%zmm6%{%%k1%}\n\t" // Turn on bit 5 for all values that passed the prior test. - "vmovdqa32\t\t%%zmm6%{uint8%},\t(%%r8)\n\t" // Save our result. + "vloadunpackld\t\t(%%r9)%{uint8%},\t%%zmm5\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vloadunpackhd\t\t16(%%r9)%{uint8%},\t%%zmm5\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vpandd\t%%zmm0,\t%%zmm5,\t%%zmm6\n\t" // Apply a mask, storing the first set of four bits into a vector. + "vpord\t%%zmm1,%%zmm6,%%zmm6%{%%k1%}\n\t" // Turn on bit 5 for all values that passed the prior test. 
+ "vmovdqa32\t\t%%zmm6%{uint8%},\t(%%r8)\n\t" // Save our result. - "vloadunpackld\t\t16(%%r9)%{uint8%},\t%%zmm7\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vloadunpackhd\t\t32(%%r9)%{uint8%},\t%%zmm7\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vprefetch1\t32(%%r9)\n\t" // Pull the next set of 4 bit sequences into the L2 cache. - "vpandd\t%%zmm0,\t%%zmm7,\t%%zmm8\n\t" // Apply a mask, storing the next set of four bits into a vector. - "vpord\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // Turn on bit 5 for all values that passed the prior test. - "vmovdqa32\t\t%%zmm8%{uint8%},\t16(%%r8)\n\t" // Save our result. + "vloadunpackld\t\t16(%%r9)%{uint8%},\t%%zmm7\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vloadunpackhd\t\t32(%%r9)%{uint8%},\t%%zmm7\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vprefetch1\t32(%%r9)\n\t" // Pull the next set of 4 bit sequences into the L2 cache. + "vpandd\t%%zmm0,\t%%zmm7,\t%%zmm8\n\t" // Apply a mask, storing the next set of four bits into a vector. + "vpord\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // Turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm8%{uint8%},\t16(%%r8)\n\t" // Save our result. "add\t$32,\t%%r8\n\t" "cmp\t$4,\t%%ecx\n\t" - "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // Select the next bit to test for. + "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // Select the next bit to test for. - "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // Perform our test. - "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // Perform our test. - "vpsrld\t$4,\t%%zmm5,\t%%zmm6\n\t" // Load our even 4 bit sequence. - "vpsrld\t$4,\t%%zmm7,\t%%zmm8\n\t" // Load our next even 4 bit sequence. - "vpord\t%%zmm1,%%zmm6,%%zmm6%{%%k1%}\n\t" // Turn on bit 5 for all values that passed the prior test. - "vpord\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // Turn on bit 5 for all values that passed the prior test. - "vmovdqa32\t\t%%zmm6%{uint8%},\t(%%r8)\n\t" // Save our result. - "vmovdqa32\t\t%%zmm8%{uint8%},\t16(%%r8)\n\t" // Save our result. + "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // Perform our test. + "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // Perform our test. + "vpsrld\t$4,\t%%zmm5,\t%%zmm6\n\t" // Load our even 4 bit sequence. + "vpsrld\t$4,\t%%zmm7,\t%%zmm8\n\t" // Load our next even 4 bit sequence. + "vpord\t%%zmm1,%%zmm6,%%zmm6%{%%k1%}\n\t" // Turn on bit 5 for all values that passed the prior test. + "vpord\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // Turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm6%{uint8%},\t(%%r8)\n\t" // Save our result. + "vmovdqa32\t\t%%zmm8%{uint8%},\t16(%%r8)\n\t" // Save our result. "vprefetchenta\t32(%%r8)\n\t" "je\t2f\n\t" "vprefetch0\t32(%%r9)\n\t" "vprefetch1\t96(%%r9)\n\t" - "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // Select the next bit to test for. + "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // Select the next bit to test for. "add\t$32,\t%%r9\n\t" "add\t$32,\t%%r8\n\t" "jmp\t1b\n\t" From 0a0bb9b7db8c68e753acf038c35cf435a0e990af Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 11 May 2024 14:02:36 +0000 Subject: [PATCH 094/105] introduce r10 and r11, for vloadunpackhd. 
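
Sketch of the register plan as of this patch (for orientation only):

    r9        base address for the vloadunpackld halves
    r10, r11  base addresses for the matching vloadunpackhd halves

Splitting the bases lets the high-half loads be biased separately later in
this series, when reads that cross a 64-byte line need a +64 adjustment.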
--- ggml-phi-knc-dot_q5_K_q8_K.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index a6072f665..4f2ff837c 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -204,8 +204,10 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "vprefetch0\t(%[SRC1])\n\t" // Issue our memory requests first thing. "vprefetch0\t(%[SRC4])\n\t" "vprefetchenta\t(%[DST])\n\t" - "mov\t%[SRC4],\t%%r9\n\t" // Load the address of the head of our 4-bit list. "mov\t%[DST],\t%%r8\n\t" // Load the address of the head of our destination list. + "mov\t%[SRC4],\t%%r9\n\t" // Load the address of the head of our 4-bit list into r9, for vloadunpackld. + "mov\t%[SRC4],\t%%r10\n\t" // Load the address of the head of our 4-bit list into r10-r11, for vloadunpackhd. + "mov\t%[SRC4],\t%%r11\n\t" "mov\t$0,%%ecx\n\t" // Initialize our counter. "vpbroadcastd\t%[MASK]%{uint8%},\t%%zmm0\n\t" // Load our mask. "vpbroadcastd\t%[BIT5]%{uint8},\t%%zmm1\n\t" // Load the bit we want to add (conditionally). @@ -220,13 +222,13 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // Test to see if our selected bit is set. "vloadunpackld\t\t(%%r9)%{uint8%},\t%%zmm5\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vloadunpackhd\t\t16(%%r9)%{uint8%},\t%%zmm5\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vloadunpackhd\t\t16(%%r10)%{uint8%},\t%%zmm5\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. "vpandd\t%%zmm0,\t%%zmm5,\t%%zmm6\n\t" // Apply a mask, storing the first set of four bits into a vector. "vpord\t%%zmm1,%%zmm6,%%zmm6%{%%k1%}\n\t" // Turn on bit 5 for all values that passed the prior test. "vmovdqa32\t\t%%zmm6%{uint8%},\t(%%r8)\n\t" // Save our result. "vloadunpackld\t\t16(%%r9)%{uint8%},\t%%zmm7\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vloadunpackhd\t\t32(%%r9)%{uint8%},\t%%zmm7\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vloadunpackhd\t\t32(%%r11)%{uint8%},\t%%zmm7\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. "vprefetch1\t32(%%r9)\n\t" // Pull the next set of 4 bit sequences into the L2 cache. "vpandd\t%%zmm0,\t%%zmm7,\t%%zmm8\n\t" // Apply a mask, storing the next set of four bits into a vector. "vpord\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // Turn on bit 5 for all values that passed the prior test. @@ -237,8 +239,8 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // Select the next bit to test for. - "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // Perform our test. - "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // Perform our test. + "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // Test to see if our selected bit is set. + "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // Test to see if our selected bit is set. "vpsrld\t$4,\t%%zmm5,\t%%zmm6\n\t" // Load our even 4 bit sequence. "vpsrld\t$4,\t%%zmm7,\t%%zmm8\n\t" // Load our next even 4 bit sequence. "vpord\t%%zmm1,%%zmm6,%%zmm6%{%%k1%}\n\t" // Turn on bit 5 for all values that passed the prior test. 
@@ -252,8 +254,10 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "vprefetch0\t32(%%r9)\n\t" "vprefetch1\t96(%%r9)\n\t" "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // Select the next bit to test for. - "add\t$32,\t%%r9\n\t" "add\t$32,\t%%r8\n\t" + "add\t$32,\t%%r9\n\t" + "add\t$32,\t%%r10\n\t" + "add\t$32,\t%%r11\n\t" "jmp\t1b\n\t" "2:" : [DST] "+r" (dst) @@ -262,7 +266,7 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint [MASK] "m" (lowmask), [M] "m" (m), [BIT5] "m" (bit5) - : "zmm0", "zmm1", "zmm2", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "cc", "ecx", "k1", "k2", "r8", "r9", "memory"); + : "zmm0", "zmm1", "zmm2", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "cc", "ecx", "k1", "k2", "r8", "r9", "r10", "r11", "memory"); } // A function for getting the dot product of two vectors, one of 5 bit resolution, and one of 8. From a1d0da669d83dbabca6045e655008809f7097837 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 11 May 2024 14:24:30 +0000 Subject: [PATCH 095/105] rename label 1 to 3. --- ggml-phi-knc-dot_q5_K_q8_K.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 4f2ff837c..5f8445b4b 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -215,7 +215,7 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "vmovdqa32\t(%[SRC1])%{uint8%},\t%%zmm3\n\t" // Load 16 sets of 8 packed single bits. "vmovdqa32\t16(%[SRC1])%{uint8%},\t%%zmm4\n\t" // Load the next 16 sets of 8 packed single bits. - "1:\n\t" + "3:\n\t" "inc\t%%ecx\n\t" // We are in the loop. increment the counter. "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // Test to see if our selected bit is set. @@ -258,7 +258,7 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "add\t$32,\t%%r9\n\t" "add\t$32,\t%%r10\n\t" "add\t$32,\t%%r11\n\t" - "jmp\t1b\n\t" + "jmp\t3b\n\t" "2:" : [DST] "+r" (dst) : [SRC4] "r" (q4), From 047defea41e7a9391bd6cfd21d6baca5c64d886c Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 11 May 2024 17:56:10 +0000 Subject: [PATCH 096/105] rename some labels. --- ggml-phi-knc-dot_q5_K_q8_K.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 5f8445b4b..f2ed1bfeb 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -119,19 +119,19 @@ void GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned (const int8x16_ "cmp\t$48,%%r10\n\t" "jl\t21f\n\t" "add\t$64,%%r12\n\t" // Greater than 47. - "jmp\t18f\n\t" + "jmp\t14f\n\t" "21:\n\t" "add\t$64,%%r13\n\t" // Between 48 and 31. - "jmp\t18f\n\t" + "jmp\t14f\n\t" "20:\n\t" // Less than 32... "cmp\t$16,%%r10\n\t" - "jz\t18f\n\t" // Zero. - "jl\t23f\n\t" + "jz\t14f\n\t" // Zero. + "jl\t13f\n\t" "add\t$64,%%r14\n\t" // Between 32 and 15. - "jmp\t18f\n\t" - "23:\n\t" + "jmp\t14f\n\t" + "13:\n\t" "add\t$64,%%r15\n\t" // Between 16 and zero. - "18:\n\t" + "14:\n\t" "vbroadcastss\t%[SCALEY],\t%%zmm3\n\t" // Load the scale factors coresponding to the two input vectors. "vbroadcastss\t%[SCALEX]%{float16%},\t%%zmm4\n\t" "vmulps\t%%zmm3,\t%%zmm4,\t%%zmm5\n\t" // Prepare the factor we're going to multiply the result by.. 
@@ -315,7 +315,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; - // FIXME: while comparing FMA output to the original output, the original had an error. Hunt it down. + // FIXME: While comparing FMA output to the original output, the original had an error. Hunt it down. GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned((const int8x16_t *)y[i].qs, q5, scales, x[i].d, y[i].d, &sums); const float dmin = GGML_PHI_FP16_TO_FP32(x[i].dmin) * y[i].d; From 7fa2d73b0a9caa7ee203b90c41f27cf761b0c87f Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 11 May 2024 19:02:48 +0000 Subject: [PATCH 097/105] relabel some other labels. --- ggml-phi-knc-dot_q5_K_q8_K.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index f2ed1bfeb..036b7f990 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -115,15 +115,15 @@ void GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned (const int8x16_ "mov\t%[SRC8],\t%%r12\n\t" "mov\t%[OFFSET],\t%%r10\n\t" "cmp\t$32,%%r10\n\t" // Examine OFFSET, and decide which (if any) of the vloadunpackhd invocations needs to be increased by 64. - "jl\t20f\n\t" + "jl\t10f\n\t" "cmp\t$48,%%r10\n\t" - "jl\t21f\n\t" + "jl\t11f\n\t" "add\t$64,%%r12\n\t" // Greater than 47. "jmp\t14f\n\t" - "21:\n\t" + "11:\n\t" "add\t$64,%%r13\n\t" // Between 48 and 31. "jmp\t14f\n\t" - "20:\n\t" // Less than 32... + "10:\n\t" // Less than 32... "cmp\t$16,%%r10\n\t" "jz\t14f\n\t" // Zero. "jl\t13f\n\t" @@ -238,9 +238,10 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "cmp\t$4,\t%%ecx\n\t" "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // Select the next bit to test for. - + "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // Test to see if our selected bit is set. "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // Test to see if our selected bit is set. + "vpsrld\t$4,\t%%zmm5,\t%%zmm6\n\t" // Load our even 4 bit sequence. "vpsrld\t$4,\t%%zmm7,\t%%zmm8\n\t" // Load our next even 4 bit sequence. "vpord\t%%zmm1,%%zmm6,%%zmm6%{%%k1%}\n\t" // Turn on bit 5 for all values that passed the prior test. From 653a565a027a8bd2f1d9552c75161b75b29c65ef Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 11 May 2024 19:24:11 +0000 Subject: [PATCH 098/105] fill and increment r12 and r13. --- ggml-phi-knc-dot_q5_K_q8_K.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 036b7f990..32d4ef22c 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -208,6 +208,8 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "mov\t%[SRC4],\t%%r9\n\t" // Load the address of the head of our 4-bit list into r9, for vloadunpackld. "mov\t%[SRC4],\t%%r10\n\t" // Load the address of the head of our 4-bit list into r10-r11, for vloadunpackhd. "mov\t%[SRC4],\t%%r11\n\t" + "mov\t%[SRC4],\t%%r12\n\t" + "mov\t%[SRC4],\t%%r13\n\t" "mov\t$0,%%ecx\n\t" // Initialize our counter. "vpbroadcastd\t%[MASK]%{uint8%},\t%%zmm0\n\t" // Load our mask. "vpbroadcastd\t%[BIT5]%{uint8},\t%%zmm1\n\t" // Load the bit we want to add (conditionally). 
@@ -259,6 +261,8 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "add\t$32,\t%%r9\n\t" "add\t$32,\t%%r10\n\t" "add\t$32,\t%%r11\n\t" + "add\t$32,\t%%r12\n\t" + "add\t$32,\t%%r13\n\t" "jmp\t3b\n\t" "2:" : [DST] "+r" (dst) @@ -267,7 +271,7 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint [MASK] "m" (lowmask), [M] "m" (m), [BIT5] "m" (bit5) - : "zmm0", "zmm1", "zmm2", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "cc", "ecx", "k1", "k2", "r8", "r9", "r10", "r11", "memory"); + : "zmm0", "zmm1", "zmm2", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "cc", "ecx", "k1", "k2", "r8", "r9", "r10", "r11", "r12", "r13", "memory"); } // A function for getting the dot product of two vectors, one of 5 bit resolution, and one of 8. From 9550ca516f1be50081481dc7021d08adee7a38a1 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 11 May 2024 19:29:09 +0000 Subject: [PATCH 099/105] add missing vector. --- ggml-phi-knc-dot_q5_K_q8_K.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 32d4ef22c..0814c7082 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -271,7 +271,7 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint [MASK] "m" (lowmask), [M] "m" (m), [BIT5] "m" (bit5) - : "zmm0", "zmm1", "zmm2", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "cc", "ecx", "k1", "k2", "r8", "r9", "r10", "r11", "r12", "r13", "memory"); + : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "cc", "ecx", "k1", "k2", "r8", "r9", "r10", "r11", "r12", "r13", "memory"); } // A function for getting the dot product of two vectors, one of 5 bit resolution, and one of 8. From efdb4116d12e2387b8adabd28218080ce74b3a34 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 11 May 2024 19:39:53 +0000 Subject: [PATCH 100/105] make the offset of q4 available. --- ggml-phi-knc-dot_q5_K_q8_K.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 0814c7082..ba0a73876 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -199,6 +199,7 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint uint8_t lowmask = 0x0F; uint8_t m=1; uint8_t bit5 = 0x10; + uint64_t q4offset=((uint64_t) q4) & 0x3f; __asm__ __volatile__ ( "vprefetch0\t(%[SRC1])\n\t" // Issue our memory requests first thing. @@ -230,7 +231,7 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "vmovdqa32\t\t%%zmm6%{uint8%},\t(%%r8)\n\t" // Save our result. "vloadunpackld\t\t16(%%r9)%{uint8%},\t%%zmm7\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vloadunpackhd\t\t32(%%r11)%{uint8%},\t%%zmm7\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vloadunpackhd\t\t32(%%r11)%{uint8%},\t%%zmm7\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. "vprefetch1\t32(%%r9)\n\t" // Pull the next set of 4 bit sequences into the L2 cache. "vpandd\t%%zmm0,\t%%zmm7,\t%%zmm8\n\t" // Apply a mask, storing the next set of four bits into a vector. "vpord\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // Turn on bit 5 for all values that passed the prior test. 
@@ -267,6 +268,7 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "2:" : [DST] "+r" (dst) : [SRC4] "r" (q4), + [OFFSET] "m" (q4offset), [SRC1] "r" (q1), [MASK] "m" (lowmask), [M] "m" (m), From 3449b0f359dd722e1d070a0b7bdf14e2d715dbf3 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 11 May 2024 19:47:20 +0000 Subject: [PATCH 101/105] minor comment fixes. --- ggml-phi-knc-dot_q5_K_q8_K.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index ba0a73876..0eaeaa729 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -90,9 +90,9 @@ void GGML_PHI_FP32_TO_FP16_ROW(const float * x, ggml_fp16_t * y, int n) } } -// This function perform two multiplies of an I8x16 and an I8x16 vector into two I16x16 vectors. Then it does an FMA on the scaled result of multiplying the two I16x16 vectors, adding the result into an I32x16. When done, it multiplies this I32x16 by a float, returning a F32x16. +// This function perform two multiplies of an I8x16 and an I8x16 vector into two I16x16 vectors. Then it does an FMA on the scaled result of multiplying the two I16x16 vectors, adding the result into an I32x16. When done, It multiplies this I32x16 by a float, returning a F32x16. // It loops 8 times. Well, actually four, with an unroll. -// Handles q8 being unaligned. +// Handles q4 being aligned incorrectly. // Requires q5 to be aligned. void GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned (const int8x16_t *q8, uint8x16_t *q5, const uint8_t *scale, ggml_fp16_t scaleX, float scaleY, float32x16_t *res) { From 1072686dcf35de3197bbbce31a5558821d5c111a Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 11 May 2024 19:48:53 +0000 Subject: [PATCH 102/105] load from identical addresses for low and high side. --- ggml-phi-knc-dot_q5_K_q8_K.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 0eaeaa729..27fc16c9d 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -225,13 +225,13 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // Test to see if our selected bit is set. "vloadunpackld\t\t(%%r9)%{uint8%},\t%%zmm5\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vloadunpackhd\t\t16(%%r10)%{uint8%},\t%%zmm5\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vloadunpackhd\t\t(%%r10)%{uint8%},\t%%zmm5\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. "vpandd\t%%zmm0,\t%%zmm5,\t%%zmm6\n\t" // Apply a mask, storing the first set of four bits into a vector. "vpord\t%%zmm1,%%zmm6,%%zmm6%{%%k1%}\n\t" // Turn on bit 5 for all values that passed the prior test. "vmovdqa32\t\t%%zmm6%{uint8%},\t(%%r8)\n\t" // Save our result. "vloadunpackld\t\t16(%%r9)%{uint8%},\t%%zmm7\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. - "vloadunpackhd\t\t32(%%r11)%{uint8%},\t%%zmm7\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vloadunpackhd\t\t16(%%r11)%{uint8%},\t%%zmm7\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. "vprefetch1\t32(%%r9)\n\t" // Pull the next set of 4 bit sequences into the L2 cache. 
"vpandd\t%%zmm0,\t%%zmm7,\t%%zmm8\n\t" // Apply a mask, storing the next set of four bits into a vector. "vpord\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // Turn on bit 5 for all values that passed the prior test. From b23ab86eda1caf44d5a4f582089a4c8107999952 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 11 May 2024 19:57:45 +0000 Subject: [PATCH 103/105] make offset available in a register. --- ggml-phi-knc-dot_q5_K_q8_K.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 27fc16c9d..4c87bb00c 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -92,7 +92,7 @@ void GGML_PHI_FP32_TO_FP16_ROW(const float * x, ggml_fp16_t * y, int n) // This function perform two multiplies of an I8x16 and an I8x16 vector into two I16x16 vectors. Then it does an FMA on the scaled result of multiplying the two I16x16 vectors, adding the result into an I32x16. When done, It multiplies this I32x16 by a float, returning a F32x16. // It loops 8 times. Well, actually four, with an unroll. -// Handles q4 being aligned incorrectly. +// Handles q8 being aligned incorrectly. // Requires q5 to be aligned. void GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned (const int8x16_t *q8, uint8x16_t *q5, const uint8_t *scale, ggml_fp16_t scaleX, float scaleY, float32x16_t *res) { @@ -192,7 +192,7 @@ void GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned (const int8x16_ } // Unpack 256 unsigned 5 bit values into an 8 bit vector. -// Handles q4 not being aligned correctly. +// Handles q4 being aligned incorrectly. // Requires dst to be aligned. void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint8x16_t * dst) { @@ -211,6 +211,7 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "mov\t%[SRC4],\t%%r11\n\t" "mov\t%[SRC4],\t%%r12\n\t" "mov\t%[SRC4],\t%%r13\n\t" + "mov\t%[OFFSET],\t%%r14\n\t" "mov\t$0,%%ecx\n\t" // Initialize our counter. "vpbroadcastd\t%[MASK]%{uint8%},\t%%zmm0\n\t" // Load our mask. "vpbroadcastd\t%[BIT5]%{uint8},\t%%zmm1\n\t" // Load the bit we want to add (conditionally). @@ -273,7 +274,7 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint [MASK] "m" (lowmask), [M] "m" (m), [BIT5] "m" (bit5) - : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "cc", "ecx", "k1", "k2", "r8", "r9", "r10", "r11", "r12", "r13", "memory"); + : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "cc", "ecx", "k1", "k2", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"); } // A function for getting the dot product of two vectors, one of 5 bit resolution, and one of 8. From a20edbf3001f4c4e4c6bbb4ba489f4307eee6d67 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sat, 11 May 2024 20:28:47 +0000 Subject: [PATCH 104/105] do 2 rounds of 4, instead of 4 rounds of 2. and properly offset unalligned reads across a 64 byte boundary. --- ggml-phi-knc-dot_q5_K_q8_K.c | 65 +++++++++++++++++++++++++++++++++--- 1 file changed, 61 insertions(+), 4 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index 4c87bb00c..acae77965 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -207,12 +207,30 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "vprefetchenta\t(%[DST])\n\t" "mov\t%[DST],\t%%r8\n\t" // Load the address of the head of our destination list. 
"mov\t%[SRC4],\t%%r9\n\t" // Load the address of the head of our 4-bit list into r9, for vloadunpackld. - "mov\t%[SRC4],\t%%r10\n\t" // Load the address of the head of our 4-bit list into r10-r11, for vloadunpackhd. + "mov\t%[SRC4],\t%%r10\n\t" // Load the address of the head of our 4-bit list into r10-r13, for vloadunpackhd. "mov\t%[SRC4],\t%%r11\n\t" "mov\t%[SRC4],\t%%r12\n\t" "mov\t%[SRC4],\t%%r13\n\t" "mov\t%[OFFSET],\t%%r14\n\t" "mov\t$0,%%ecx\n\t" // Initialize our counter. + "cmp\t$32,%%r14\n\t" // Examine OFFSET, and decide which (if any) of the vloadunpackhd invocations needs to be increased by 64. + "jl\t20f\n\t" + "cmp\t$48,%%r14\n\t" + "jl\t21f\n\t" + "add\t$64,%%r10\n\t" // Greater than 47. + "jmp\t24f\n\t" + "21:\n\t" + "add\t$64,%%r11\n\t" // Between 48 and 31. + "jmp\t24f\n\t" + "20:\n\t" // Less than 32... + "cmp\t$16,%%r14\n\t" + "jz\t24f\n\t" // Zero. + "jl\t23f\n\t" + "add\t$64,%%r12\n\t" // Between 32 and 15. + "jmp\t24f\n\t" + "23:\n\t" + "add\t$64,%%r13\n\t" // Between 16 and zero. + "24:\n\t" "vpbroadcastd\t%[MASK]%{uint8%},\t%%zmm0\n\t" // Load our mask. "vpbroadcastd\t%[BIT5]%{uint8},\t%%zmm1\n\t" // Load the bit we want to add (conditionally). "vpbroadcastd\t%[M]%{uint8%},\t%%zmm2\n\t" // Select which bit we want to test for. Start with bit 1. @@ -239,8 +257,6 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "vmovdqa32\t\t%%zmm8%{uint8%},\t16(%%r8)\n\t" // Save our result. "add\t$32,\t%%r8\n\t" - "cmp\t$4,\t%%ecx\n\t" - "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // Select the next bit to test for. "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // Test to see if our selected bit is set. @@ -254,6 +270,47 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "vmovdqa32\t\t%%zmm8%{uint8%},\t16(%%r8)\n\t" // Save our result. "vprefetchenta\t32(%%r8)\n\t" + "vprefetch0\t32(%%r9)\n\t" + "vprefetch1\t96(%%r9)\n\t" + "add\t$32,\t%%r8\n\t" + "add\t$32,\t%%r9\n\t" + "add\t$32,\t%%r10\n\t" + "add\t$32,\t%%r11\n\t" + "add\t$32,\t%%r12\n\t" + "add\t$32,\t%%r13\n\t" + "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // Select the next bit to test for. + + "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // Test to see if our selected bit is set. + "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // Test to see if our selected bit is set. + + "vloadunpackld\t\t(%%r9)%{uint8%},\t%%zmm9\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vloadunpackhd\t\t(%%r12)%{uint8%},\t%%zmm9\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vpandd\t%%zmm0,\t%%zmm9,\t%%zmm10\n\t" // Apply a mask, storing the first set of four bits into a vector. + "vpord\t%%zmm1,%%zmm10,%%zmm10%{%%k1%}\n\t" // Turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm10%{uint8%},\t(%%r8)\n\t" // Save our result. + + "vloadunpackld\t\t16(%%r9)%{uint8%},\t%%zmm11\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vloadunpackhd\t\t16(%%r13)%{uint8%},\t%%zmm11\n\t" // Load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value. + "vprefetch1\t32(%%r9)\n\t" // Pull the next set of 4 bit sequences into the L2 cache. + "vpandd\t%%zmm0,\t%%zmm11,\t%%zmm12\n\t" // Apply a mask, storing the next set of four bits into a vector. + "vpord\t%%zmm1,%%zmm12,%%zmm12%{%%k2%}\n\t" // Turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm12%{uint8%},\t16(%%r8)\n\t" // Save our result. 
+ + "add\t$32,\t%%r8\n\t" + "cmp\t$2,\t%%ecx\n\t" + "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // Select the next bit to test for. + + "vptestmd\t%%zmm3,\t%%zmm2,\t%%k1\n\t" // Test to see if our selected bit is set. + "vptestmd\t%%zmm4,\t%%zmm2,\t%%k2\n\t" // Test to see if our selected bit is set. + + "vpsrld\t$4,\t%%zmm9,\t%%zmm10\n\t" // Load our even 4 bit sequence. + "vpsrld\t$4,\t%%zmm11,\t%%zmm12\n\t" // Load our next even 4 bit sequence. + "vpord\t%%zmm1,%%zmm10,%%zmm10%{%%k1%}\n\t" // Turn on bit 5 for all values that passed the prior test. + "vpord\t%%zmm1,%%zmm12,%%zmm12%{%%k2%}\n\t" // Turn on bit 5 for all values that passed the prior test. + "vmovdqa32\t\t%%zmm10%{uint8%},\t(%%r8)\n\t" // Save our result. + "vmovdqa32\t\t%%zmm12%{uint8%},\t16(%%r8)\n\t" // Save our result. + "vprefetchenta\t32(%%r8)\n\t" + "je\t2f\n\t" "vprefetch0\t32(%%r9)\n\t" @@ -274,7 +331,7 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint [MASK] "m" (lowmask), [M] "m" (m), [BIT5] "m" (bit5) - : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "cc", "ecx", "k1", "k2", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"); + : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "cc", "ecx", "k1", "k2", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"); } // A function for getting the dot product of two vectors, one of 5 bit resolution, and one of 8. From 0add3107f7cbd6cf021cffed4804beb8859290a6 Mon Sep 17 00:00:00 2001 From: Julia Longtin Date: Sun, 12 May 2024 09:36:08 +0000 Subject: [PATCH 105/105] spacing changes. --- ggml-phi-knc-dot_q5_K_q8_K.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c index acae77965..754366185 100644 --- a/ggml-phi-knc-dot_q5_K_q8_K.c +++ b/ggml-phi-knc-dot_q5_K_q8_K.c @@ -255,7 +255,7 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint "vpandd\t%%zmm0,\t%%zmm7,\t%%zmm8\n\t" // Apply a mask, storing the next set of four bits into a vector. "vpord\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // Turn on bit 5 for all values that passed the prior test. "vmovdqa32\t\t%%zmm8%{uint8%},\t16(%%r8)\n\t" // Save our result. - + "add\t$32,\t%%r8\n\t" "vpslld\t$1,\t%%zmm2,\t%%zmm2\n\t" // Select the next bit to test for. @@ -333,7 +333,7 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint [BIT5] "m" (bit5) : "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "cc", "ecx", "k1", "k2", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"); } - + // A function for getting the dot product of two vectors, one of 5 bit resolution, and one of 8. // Used during inference, if your model prints "llama_model_loader: - type q5_K: XXX tensors", and XXX is not zero. :) void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
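
As a plain-C reference for what GGML_5bit_Unpack_Unaligned is computing in the
patches above: stripped of the IMCI details, it is roughly the loop below. This
is an illustrative sketch only, not part of the patches; it assumes the usual
Q5_K block layout (128 bytes of packed 4-bit lows in q4, 32 bytes of packed
high bits in q1, 256 output bytes in dst), and the function name is made up
for the example.

    #include <stdint.h>

    /* Illustrative scalar equivalent of the 5-bit unpack: combine each packed
       4-bit low nibble with one high bit per value, yielding 256 values in 0..31. */
    static void unpack_5bit_scalar(const uint8_t *q4, const uint8_t *q1, uint8_t *dst)
    {
        uint8_t m = 1;                        /* which high bit is tested this round */
        for (int j = 0; j < 4; ++j) {         /* 4 rounds of 64 outputs = 256 values */
            for (int l = 0; l < 32; ++l)      /* low nibbles of 32 packed bytes      */
                dst[l] = (uint8_t)((q4[l] & 0x0F) | ((q1[l] & m) ? 0x10 : 0));
            dst += 32; m <<= 1;
            for (int l = 0; l < 32; ++l)      /* high nibbles of the same 32 bytes   */
                dst[l] = (uint8_t)((q4[l] >> 4) | ((q1[l] & m) ? 0x10 : 0));
            dst += 32; m <<= 1;
            q4 += 32;
        }
    }

The assembly versions mainly differ in how they fetch the possibly unaligned
q4 bytes (the vloadunpackld/vloadunpackhd pairs), in using mask registers and
vpord instead of a per-element conditional, and in how many of these rounds
are unrolled per loop iteration.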