From 90498c1181b12273335fc4eb460f75acf75907ac Mon Sep 17 00:00:00 2001
From: Julia Longtin
Date: Tue, 2 Apr 2024 16:55:40 +0000
Subject: [PATCH] minor changes.

---
 ggml-phi-knc-dot_q5_K_q8_K.c | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/ggml-phi-knc-dot_q5_K_q8_K.c b/ggml-phi-knc-dot_q5_K_q8_K.c
index 1145dfff7..02545057f 100644
--- a/ggml-phi-knc-dot_q5_K_q8_K.c
+++ b/ggml-phi-knc-dot_q5_K_q8_K.c
@@ -21,13 +21,9 @@
 #define GGML_F32_EPR 16
 
 /* we force an alignment, because i haven't written unaligned forms of the assembly functions, yet.. */
-typedef float float32x8_t __attribute__((vector_size (32)));
 typedef float float32x16_t __attribute__((vector_size (64), aligned(64)));
 typedef int8_t int8x16_t __attribute__((vector_size (16), aligned(16)));
 typedef uint8_t uint8x16_t __attribute__((vector_size (16), aligned(16)));
-typedef int16_t int16x8_t __attribute__((vector_size (16)));
-typedef int16_t int16x16_t __attribute__((vector_size (32)));
-typedef int32_t int32x8_t __attribute__((vector_size (32)));
 typedef int32_t int32x16_t __attribute__((vector_size (64), aligned(64)));
 
 /* A forward declaration, to keep GCC happy. */
@@ -168,6 +164,8 @@ inline static void GGML_5bit_Unpack (const uint8x16_t * q4, const uint8_t * q1,
     );
 }
 
+// A function for getting the dot product of two vectors, one of 5-bit resolution and one of 8-bit.
+// Used during inference, if your model prints "llama_model_loader: - type q5_K: XXX tensors", and XXX is not zero. :)
 void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
 
     /* interpret X and Y as vectors. */
@@ -207,6 +205,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         // combine our 4 and 1 bit vector sets into an 8 bit value.
         GGML_5bit_Unpack(q4copyvec, x[i].qh, aux8);
 
+        // extract scales and mins.
         memcpy(utmp, x[i].scales, 12);
         utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
         const uint32_t uaux = utmp[1] & kmask1;
@@ -220,24 +219,17 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
         GGML_I32x16_VEC_ZERO(&aux32);
 
-        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
-        int is = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            for (int l = 0; l < 16; ++l) ((int16_t *)&aux16[j*2])[l] = q8[l] * a[l];
-            q8 += 16; a += 16;
-            for (int l = 0; l < 16; ++l) ((int16_t *)&aux16[(j*2)+1])[l] = q8[l] * a[l];
-            q8 += 16; a += 16;
-        }
-
         // FIXME: while comparing FMA output to the original output, the original had an error. hunt it down.
         GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16(q8copy, aux8, scales, &aux32);
 
+        int sumi = 0;
+        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 16; ++l) ((float *)&sums)[l] += d * ((int32_t *)&aux32)[l];
         const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
         sumf -= dmin * sumi;
     }
-
+
     for (int l = 0; l < 16; ++l) sumf += ((float *)&sums)[l];
     *s = sumf;
 }
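
For reference, a minimal scalar sketch of the quantity ggml_vec_dot_q5_K_q8_K computes per super-block. The layouts below (block_q5_sketch, block_q8_sketch) are hypothetical simplifications, not the real ggml structs: the 6-bit scales/mins are shown already unpacked, and the 5-bit quants already widened to int8 (the job GGML_5bit_Unpack does in the patched file). What it illustrates is the two-term form the kernel relies on: a scaled integer dot product, minus a mins correction built only from y's 16-wide block sums.

#include <stdint.h>

enum { K = 256, SUB = 32 };             /* one super-block: 8 sub-blocks of 32 */

typedef struct {
    float   d, dmin;                    /* super-block scale, and min scale    */
    uint8_t scales[K / SUB];            /* per-sub-block scales (pre-unpacked) */
    uint8_t mins[K / SUB];              /* per-sub-block mins (pre-unpacked)   */
    int8_t  q[K];                       /* 5-bit quants, widened to int8       */
} block_q5_sketch;

typedef struct {
    float   d;                          /* scale for the 8-bit side            */
    int8_t  q[K];
    int16_t bsums[K / 16];              /* sum of each 16-element group of q   */
} block_q8_sketch;

/* Scalar reference for one super-block pair: accumulate the per-sub-block
   integer dot products under their scales, then subtract the dmin-weighted
   mins correction (each min covers two adjacent bsums entries). */
float dot_one_superblock(const block_q5_sketch *x, const block_q8_sketch *y)
{
    float acc = 0.0f;
    for (int j = 0; j < K / SUB; ++j) {
        int32_t isum = 0;
        for (int l = 0; l < SUB; ++l)
            isum += (int32_t)x->q[j * SUB + l] * (int32_t)y->q[j * SUB + l];
        acc += x->d * y->d * (float)(x->scales[j] * isum);
    }
    int32_t sumi = 0;
    for (int j = 0; j < K / 16; ++j)
        sumi += y->bsums[j] * x->mins[j / 2];
    return acc - x->dmin * y->d * (float)sumi;
}

This also shows why the hunk at -220 is safe: the sumi term depends only on bsums and mins, never on the per-element products, so it can be computed before or after the FMA call, and the patch simply declares sumi next to its only use.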