From dbd6c204b4f2078aecc07562480fe8f9a824f8d0 Mon Sep 17 00:00:00 2001
From: Stephan Walter <stephan@walter.name>
Date: Wed, 10 May 2023 11:28:36 +0200
Subject: [PATCH] AVX2 implementation of Q8 quantization

---
 ggml.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/ggml.c b/ggml.c
index 26e162efa..13d06ad74 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1415,7 +1415,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
             y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3);
         }
     }
-#elif 0 //defined(__AVX2__) || defined(__AVX__) TODO
+#elif defined(__AVX2__) // || defined(__AVX__) TODO
     for (int i = 0; i < nb; i++) {
         // Load elements into 4 AVX vectors
         __m256 v0 = _mm256_loadu_ps( x );
@@ -1468,10 +1468,11 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
         i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
 
         // We got our precious signed bytes, but the order is now wrong
-        // These AVX2 pack instructions process 16-byte pieces independently
-        // The following instruction is fixing the order
-        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
-        i0 = _mm256_permutevar8x32_epi32( i0, perm );
+        // TODO: find a smarter way to do this
+        i2 = _mm256_permute2f128_si256(i0, i0, 0x01);
+        i1 = _mm256_shuffle_epi8(i0, _mm256_setr_epi8( 0, 2,-1,-1, 4, 6,-1,-1, 8,10,-1,-1,12,14,-1,-1,-1,-1, 1, 3,-1,-1, 5, 7,-1,-1, 9,11,-1,-1,13,15));
+        i2 = _mm256_shuffle_epi8(i2, _mm256_setr_epi8(-1,-1, 0, 2,-1,-1, 4, 6,-1,-1, 8,10,-1,-1,12,14, 1, 3,-1,-1, 5, 7,-1,-1, 9,11,-1,-1,13,15,-1,-1));
+        i0 = _mm256_or_si256(i1, i2);
 
         _mm256_storeu_si256((__m256i *)y[i].qs, i0);
 #else
@@ -1604,7 +1605,7 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
         y[i].s0 = d * sum0;
         y[i].s1 = d * sum1;
     }
-#elif 0//defined(__AVX2__) || defined(__AVX__) TODO
+#elif defined(__AVX2__) // || defined(__AVX__) TODO
     for (int i = 0; i < nb; i++) {
         // Load elements into 4 AVX vectors
         __m256 v0 = _mm256_loadu_ps( x );
@@ -1662,10 +1663,11 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
         i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
 
         // We got our precious signed bytes, but the order is now wrong
-        // These AVX2 pack instructions process 16-byte pieces independently
-        // The following instruction is fixing the order
-        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
-        i0 = _mm256_permutevar8x32_epi32( i0, perm );
+        // TODO: find a smarter way to do this
+        i2 = _mm256_permute2f128_si256(i0, i0, 0x01);
+        i1 = _mm256_shuffle_epi8(i0, _mm256_setr_epi8( 0, 2,-1,-1, 4, 6,-1,-1, 8,10,-1,-1,12,14,-1,-1,-1,-1, 1, 3,-1,-1, 5, 7,-1,-1, 9,11,-1,-1,13,15));
+        i2 = _mm256_shuffle_epi8(i2, _mm256_setr_epi8(-1,-1, 0, 2,-1,-1, 4, 6,-1,-1, 8,10,-1,-1,12,14, 1, 3,-1,-1, 5, 7,-1,-1, 9,11,-1,-1,13,15,-1,-1));
+        i0 = _mm256_or_si256(i1, i2);
 
         _mm256_storeu_si256((__m256i *)y[i].qs, i0);
 #else