fix an offset error, and get rid of tabs.

Julia Longtin 2024-04-22 18:29:31 +00:00
parent e37b7f8497
commit 4fb1547ba6


@@ -208,8 +208,8 @@ void GGML_5bit_Unpack_Unaligned (const uint8x16_t * q4, const uint8_t * q1, uint
 "vpord\t%%zmm1,%%zmm6,%%zmm6%{%%k1%}\n\t" // turn on bit 5 for all values that passed the prior test.
 "vmovdqa32\t\t%%zmm6%{uint8%},\t(%%r8)\n\t" // save our result.
-"vloadunpackld\t\t(%%r9)%{uint8%},\t%%zmm7\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value.
-"vloadunpackhd\t\t16(%%r9)%{uint8%},\t%%zmm7\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value.
+"vloadunpackld\t\t16(%%r9)%{uint8%},\t%%zmm7\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value.
+"vloadunpackhd\t\t32(%%r9)%{uint8%},\t%%zmm7\n\t" // load our odd 4 bit sequences. note that it loads two 4 bit sequences into each zmm value.
 "vprefetch1\t32(%%r9)\n\t" // pull the next set of 4 bit sequences into the L2 cache.
 "vpandd\t%%zmm0,\t%%zmm7,\t%%zmm8\n\t" // apply a mask, storing the next low four bits of vector zmm1 into zmm5.
 "vpaddd\t%%zmm1,%%zmm8,%%zmm8%{%%k2%}\n\t" // turn on bit 5 for all values that passed the prior test.