ggml: add AVX support
parent 799fdc1b5d
commit 60196ae73d

1 changed file with 356 additions and 20 deletions

ggml.c (356 additions, 20 deletions)
@@ -588,6 +588,93 @@ static inline __m128i packNibbles( __m256i bytes )
#endif
}
#else
// spread 32 bits to 32 bytes { 0x00, 0xFF }
static inline __m256i bytes_from_bits_32(const uint8_t * x) {
    uint32_t x32;
    memcpy(&x32, x, sizeof(uint32_t));
    const __m128i shuf_maskl = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
    const __m128i shuf_maskh = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202);
    __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl);
    __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh);
    const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe);
    bytesl = _mm_or_si128(bytesl, bit_mask);
    bytesh = _mm_or_si128(bytesh, bit_mask);
    bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1));
    bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
    return _mm256_set_m128i(bytesh, bytesl);
}
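
The OR-plus-compare trick above turns each of the 32 input bits into a full 0x00/0xFF byte mask: each byte of bit_mask clears exactly one bit position, so after the OR a byte becomes all-ones exactly when its assigned bit of x32 was set. A minimal scalar sketch of the same mapping (hypothetical helper, not part of this commit):

#include <stdint.h>
#include <string.h>

// Scalar model of bytes_from_bits_32: out[k] is 0xFF when bit k of the
// 32-bit input is set and 0x00 otherwise, matching the SSE mask trick.
static void bytes_from_bits_32_scalar(const uint8_t * x, uint8_t out[32]) {
    uint32_t x32;
    memcpy(&x32, x, sizeof(uint32_t));
    for (int k = 0; k < 32; ++k) {
        out[k] = ((x32 >> k) & 1) ? 0xFF : 0x00;
    }
}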

// Unpack 32 4-bit fields into 32 bytes
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
{
    // Load 16 bytes from memory
    __m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi );

    // Expand bytes into uint16_t values
    __m128i bytesl = _mm_cvtepu8_epi16( tmp );
    __m128i bytesh = _mm_unpackhi_epi8( tmp, _mm_setzero_si128() );

    // Unpack values into individual bytes
    const __m128i lowMask = _mm_set1_epi8( 0xF );
    __m128i highl = _mm_andnot_si128( lowMask, bytesl );
    __m128i highh = _mm_andnot_si128( lowMask, bytesh );
    const __m128i lowl = _mm_and_si128( lowMask, bytesl );
    const __m128i lowh = _mm_and_si128( lowMask, bytesh );
    highl = _mm_slli_epi16( highl, 4 );
    highh = _mm_slli_epi16( highh, 4 );
    bytesl = _mm_or_si128( lowl, highl );
    bytesh = _mm_or_si128( lowh, highh );
    __m256i bytes = _mm256_set_m128i(bytesh, bytesl);
    return bytes;
}
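
As the masks and shifts above imply, output byte 2k receives the low nibble of input byte k and output byte 2k+1 its high nibble. A scalar sketch of that layout (hypothetical helper, for illustration only):

#include <stdint.h>

// Scalar model of bytes_from_nibbles_32: the nibbles of input byte k land in
// output bytes 2k (low nibble) and 2k+1 (high nibble), each in [0, 15].
static void bytes_from_nibbles_32_scalar(const uint8_t * in, uint8_t out[32]) {
    for (int k = 0; k < 16; ++k) {
        out[2*k + 0] = in[k] & 0x0F;
        out[2*k + 1] = in[k] >> 4;
    }
}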

// add int16_t pairwise and return as float vector
static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
    const __m128i ones = _mm_set1_epi16(1);
    const __m128i summed_pairsl = _mm_madd_epi16(ones, xl);
    const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
    const __m256i summed_pairs = _mm256_set_m128i(summed_pairsh, summed_pairsl);
    return _mm256_cvtepi32_ps(summed_pairs);
}

// multiply int8_t, add results pairwise twice and return as float vector
static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
    const __m128i xl = _mm256_castsi256_si128(x);
    const __m128i xh = _mm256_extractf128_si256(x, 1);
    const __m128i yl = _mm256_castsi256_si128(y);
    const __m128i yh = _mm256_extractf128_si256(y, 1);
    // Get absolute values of x vectors
    const __m128i axl = _mm_sign_epi8(xl, xl);
    const __m128i axh = _mm_sign_epi8(xh, xh);
    // Sign the values of the y vectors
    const __m128i syl = _mm_sign_epi8(yl, xl);
    const __m128i syh = _mm_sign_epi8(yh, xh);
    // Perform multiplication and create 16-bit values
    const __m128i dotl = _mm_maddubs_epi16(axl, syl);
    const __m128i doth = _mm_maddubs_epi16(axh, syh);
    return sum_i16_pairs_float(doth, dotl);
}
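
The abs/sign pair above exists because _mm_maddubs_epi16 multiplies an unsigned operand by a signed one, so the code feeds it |x| and sign(x)*y, which still multiplies out to x*y per lane. Ignoring the 16-bit saturation maddubs can introduce, a scalar sketch of what the helper returns (hypothetical reference, not part of ggml.c):

#include <stdint.h>

// Scalar model of mul_sum_i8_pairs_float: dot products of 4-byte groups,
// returned as 8 floats (groups 0..3 come from the low 128 bits, 4..7 from the high).
static void mul_sum_i8_pairs_float_scalar(const int8_t x[32], const int8_t y[32], float out[8]) {
    for (int k = 0; k < 8; ++k) {
        int32_t sum = 0;
        for (int j = 0; j < 4; ++j) {
            sum += (int32_t) x[4*k + j] * (int32_t) y[4*k + j];
        }
        out[k] = (float) sum;
    }
}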

static inline __m128i packNibbles_256( __m256i bytes )
{
    __m128i bytesl = _mm256_castsi256_si128(bytes);
    __m128i bytesh = _mm256_extractf128_si256(bytes, 1);
    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
    const __m128i lowByte = _mm_set1_epi16( 0xFF );
    __m128i highl = _mm_andnot_si128( lowByte, bytesl );
    __m128i highh = _mm_andnot_si128( lowByte, bytesh );
    const __m128i lowl = _mm_and_si128( lowByte, bytesl );
    const __m128i lowh = _mm_and_si128( lowByte, bytesh );
    highl = _mm_srli_epi16( highl, 4 );
    highh = _mm_srli_epi16( highh, 4 );
    bytesl = _mm_or_si128( lowl, highl );
    bytesh = _mm_or_si128( lowh, highh );

    // Compress uint16_t lanes into bytes
    return _mm_packus_epi16( bytesl, bytesh );
}

static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
{
    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
@@ -1096,23 +1183,23 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int

        // Since we don't have some necessary functions in AVX,
        // we split the registers in half and call the SSE analogs of the AVX2 functions
        __m128i ni0 = _mm256_castsi256_si128( i0 );
        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
        __m128i ni2 = _mm256_castsi256_si128( i1 );
        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
        __m128i ni4 = _mm256_castsi256_si128( i2 );
        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
        __m128i ni6 = _mm256_castsi256_si128( i3 );
        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
        __m128i ni0 = _mm256_castsi256_si128( i0 ); // 0,1,2,3
        __m128i ni1 = _mm256_extractf128_si256( i0, 1); // 4,5,6,7
        __m128i ni2 = _mm256_castsi256_si128( i1 ); // 8,9,10,11
        __m128i ni3 = _mm256_extractf128_si256( i1, 1); // 12,13,14,15
        __m128i ni4 = _mm256_castsi256_si128( i2 ); // 16,17,18,19
        __m128i ni5 = _mm256_extractf128_si256( i2, 1); // 20,21,22,23
        __m128i ni6 = _mm256_castsi256_si128( i3 ); // 24,25,26,27
        __m128i ni7 = _mm256_extractf128_si256( i3, 1); // 28,29,30,31

        // Convert int32 to int16
        ni0 = _mm_packs_epi32( ni0, ni1 );
        ni2 = _mm_packs_epi32( ni2, ni3 );
        ni4 = _mm_packs_epi32( ni4, ni5 );
        ni6 = _mm_packs_epi32( ni6, ni7 );
        ni0 = _mm_packs_epi32( ni0, ni1 ); // 0,1,2,3, 4,5,6,7
        ni2 = _mm_packs_epi32( ni2, ni3 ); // 8,9,10,11, 12,13,14,15
        ni4 = _mm_packs_epi32( ni4, ni5 ); // 16,17,18,19, 20,21,22,23
        ni6 = _mm_packs_epi32( ni6, ni7 ); // 24,25,26,27, 28,29,30,31
        // Convert int16 to int8
        ni0 = _mm_packs_epi16( ni0, ni2 );
        ni4 = _mm_packs_epi16( ni4, ni6 );
        ni0 = _mm_packs_epi16( ni0, ni2 ); // 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15
        ni4 = _mm_packs_epi16( ni4, ni6 ); // 16,17,18,19, 20,21,22,23, 24,25,26,27, 28,29,30,31

        // Apply offset and clamp to translate the range from [ -8 .. +8 ] into [ +0 .. +15 ]
        const __m128i off = _mm_set1_epi8( 8 );
@@ -1222,7 +1309,7 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int

    block_q4_1 * restrict y = vy;

#if defined(__AVX2__)
#if defined(__AVX2__) || defined(__AVX__)
    for (int i = 0; i < nb; i++) {
        // Load elements into 4 AVX vectors
        __m256 v0 = _mm256_loadu_ps( x );
@@ -1280,10 +1367,11 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int
        __m256i i2 = _mm256_cvtps_epi32( v2 );
        __m256i i3 = _mm256_cvtps_epi32( v3 );

#if defined(__AVX2__)
        // Convert int32 to int16
        i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
        i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31
        // Convert int16 to int8
        i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31

        // We got our precious signed bytes, but the order is now wrong
@@ -1294,6 +1382,30 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int

        // Compress the vector into 4 bit/value, and store
        __m128i res = packNibbles( i0 );
#elif defined(__AVX__)
        // Since we don't have some necessary functions in AVX,
        // we split the registers in half and call the SSE analogs of the AVX2 functions
        __m128i ni0 = _mm256_castsi256_si128( i0 ); // 0,1,2,3
        __m128i ni1 = _mm256_extractf128_si256( i0, 1); // 4,5,6,7
        __m128i ni2 = _mm256_castsi256_si128( i1 ); // 8,9,10,11
        __m128i ni3 = _mm256_extractf128_si256( i1, 1); // 12,13,14,15
        __m128i ni4 = _mm256_castsi256_si128( i2 ); // 16,17,18,19
        __m128i ni5 = _mm256_extractf128_si256( i2, 1); // 20,21,22,23
        __m128i ni6 = _mm256_castsi256_si128( i3 ); // 24,25,26,27
        __m128i ni7 = _mm256_extractf128_si256( i3, 1); // 28,29,30,31

        // Convert int32 to int16
        ni0 = _mm_packs_epi32( ni0, ni1 ); // 0,1,2,3, 4,5,6,7
        ni2 = _mm_packs_epi32( ni2, ni3 ); // 8,9,10,11, 12,13,14,15
        ni4 = _mm_packs_epi32( ni4, ni5 ); // 16,17,18,19, 20,21,22,23
        ni6 = _mm_packs_epi32( ni6, ni7 ); // 24,25,26,27, 28,29,30,31
        // Convert int16 to int8
        ni0 = _mm_packs_epi16( ni0, ni2 ); // 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15
        ni4 = _mm_packs_epi16( ni4, ni6 ); // 16,17,18,19, 20,21,22,23, 24,25,26,27, 28,29,30,31

        // Compress the vector into 4 bit/value, and store
        __m128i res = packNibbles( ni0, ni4 );
#endif
        _mm_storeu_si128( ( __m128i* )y[i].qs, res );
    }
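
The packNibbles( ni0, ni4 ) call above packs two quantized values per byte: element 2j of the block goes to the low nibble of qs[j] and element 2j+1 to the high nibble, the same layout bytes_from_nibbles_32 unpacks. A scalar sketch of that packing (hypothetical helper):

#include <stdint.h>

// Scalar model of the nibble packing used for the q4 blocks above:
// qs[j] holds quantized value 2j in its low nibble and value 2j+1 in its high nibble.
static void pack_nibbles_scalar(const uint8_t q[32], uint8_t qs[16]) {
    for (int j = 0; j < 16; ++j) {
        qs[j] = (uint8_t)((q[2*j + 0] & 0x0F) | ((q[2*j + 1] & 0x0F) << 4));
    }
}
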
#elif __ARM_NEON
@@ -1874,6 +1986,48 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in
            }
        }
    }
#elif defined(__AVX__)
    for (int i = 0; i < nb; i++) {
        // scale factor
        const __m256 d_v = _mm256_broadcast_ss(&x[i].d);

        const uint8_t * restrict pp = x[i].qs;

        for (int l = 0; l < QK4_0; l += 32) {
            // Load 32x4-bit integers into 32x8-bit integers
            __m256i vx8 = bytes_from_nibbles_32(pp+l/2);
            __m128i vx8_lo = _mm256_castsi256_si128(vx8);
            __m128i vx8_hi = _mm256_extractf128_si256(vx8, 1);

            // Subtract 8 from the integers
            vx8_lo = _mm_sub_epi8(vx8_lo, _mm_set1_epi8(8));
            vx8_hi = _mm_sub_epi8(vx8_hi, _mm_set1_epi8(8));

            // Convert to 16-bit int
            __m128i vx16_lol = _mm_cvtepi8_epi16(vx8_lo);
            __m128i vx16_loh = _mm_cvtepi8_epi16(_mm_srli_si128(vx8_lo, 8));
            __m128i vx16_hil = _mm_cvtepi8_epi16(vx8_hi);
            __m128i vx16_hih = _mm_cvtepi8_epi16(_mm_srli_si128(vx8_hi, 8));

            // Convert to 32-bit int -> float 32
            const __m256 vf[4] = {
                _mm256_cvtepi32_ps(_mm256_set_m128i(_mm_cvtepi16_epi32(_mm_srli_si128(vx16_lol, 8)), _mm_cvtepi16_epi32(vx16_lol))),
                _mm256_cvtepi32_ps(_mm256_set_m128i(_mm_cvtepi16_epi32(_mm_srli_si128(vx16_loh, 8)), _mm_cvtepi16_epi32(vx16_loh))),
                _mm256_cvtepi32_ps(_mm256_set_m128i(_mm_cvtepi16_epi32(_mm_srli_si128(vx16_hil, 8)), _mm_cvtepi16_epi32(vx16_hil))),
                _mm256_cvtepi32_ps(_mm256_set_m128i(_mm_cvtepi16_epi32(_mm_srli_si128(vx16_hih, 8)), _mm_cvtepi16_epi32(vx16_hih)))
            };

            // Scale and store
            __m256 result = _mm256_mul_ps(vf[0], d_v);
            _mm256_storeu_ps(y + i * QK4_0 + l + 0, result);
            result = _mm256_mul_ps(vf[1], d_v);
            _mm256_storeu_ps(y + i * QK4_0 + l + 8, result);
            result = _mm256_mul_ps(vf[2], d_v);
            _mm256_storeu_ps(y + i * QK4_0 + l + 16, result);
            result = _mm256_mul_ps(vf[3], d_v);
            _mm256_storeu_ps(y + i * QK4_0 + l + 24, result);
        }
    }
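
The AVX loop above reconstructs each q4_0 weight as d * (nibble - 8). A scalar sketch of one block, assuming the interleaved nibble layout described for bytes_from_nibbles_32 (hypothetical reference code):

#include <stdint.h>

// Scalar model of one q4_0 block: two 4-bit values per byte, low nibble first,
// each mapped to d * (v - 8).
static void dequantize_block_q4_0_scalar(float d, const uint8_t * qs, float * y, int qk) {
    for (int j = 0; j < qk/2; ++j) {
        y[2*j + 0] = d * (float)((int)(qs[j] & 0x0F) - 8);
        y[2*j + 1] = d * (float)((int)(qs[j] >> 4) - 8);
    }
}
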
#elif defined(__ARM_NEON)
    for (int i = 0; i < nb; i++) {
        const float32x4_t vd = vdupq_n_f32(x[i].d);
@@ -1983,10 +2137,52 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
            };

            // Scale, add m and store
            for (int j = 0; j < 4; j++) {
                const __m256 result = _mm256_add_ps(_mm256_mul_ps(vf[j], d_v), d_m);
                _mm256_storeu_ps(y + i * QK4_1 + l + j*8, result);
            }
            __m256 result = _mm256_fmadd_ps(vf[0], d_v, d_m);
            _mm256_storeu_ps(y + i * QK4_1 + l + 0, result);
            result = _mm256_fmadd_ps(vf[1], d_v, d_m);
            _mm256_storeu_ps(y + i * QK4_1 + l + 8, result);
            result = _mm256_fmadd_ps(vf[2], d_v, d_m);
            _mm256_storeu_ps(y + i * QK4_1 + l + 16, result);
            result = _mm256_fmadd_ps(vf[3], d_v, d_m);
            _mm256_storeu_ps(y + i * QK4_1 + l + 24, result);
        }
    }
#elif defined(__AVX__)
    for (int i = 0; i < nb; i++) {
        const __m256 d_v = _mm256_broadcast_ss(&x[i].d);
        const __m256 d_m = _mm256_broadcast_ss(&x[i].m);

        const uint8_t * restrict pp = x[i].qs;

        for (int l = 0; l < QK4_1; l += 32) {
            // Load 32x4-bit integers into 32x8-bit integers
            const __m256i vx8 = bytes_from_nibbles_32(pp+l/2);
            const __m128i vx8_lo = _mm256_castsi256_si128(vx8);
            const __m128i vx8_hi = _mm256_extractf128_si256(vx8, 1);

            // Convert to 16-bit int
            const __m128i vx16_lol = _mm_cvtepi8_epi16(vx8_lo);
            const __m128i vx16_loh = _mm_cvtepi8_epi16(_mm_srli_si128(vx8_lo, 8));
            const __m128i vx16_hil = _mm_cvtepi8_epi16(vx8_hi);
            const __m128i vx16_hih = _mm_cvtepi8_epi16(_mm_srli_si128(vx8_hi, 8));

            // Convert to 32-bit int -> float 32
            const __m256 vf[4] = {
                _mm256_cvtepi32_ps(_mm256_set_m128i(_mm_cvtepi16_epi32(_mm_srli_si128(vx16_lol, 8)), _mm_cvtepi16_epi32(vx16_lol))),
                _mm256_cvtepi32_ps(_mm256_set_m128i(_mm_cvtepi16_epi32(_mm_srli_si128(vx16_loh, 8)), _mm_cvtepi16_epi32(vx16_loh))),
                _mm256_cvtepi32_ps(_mm256_set_m128i(_mm_cvtepi16_epi32(_mm_srli_si128(vx16_hil, 8)), _mm_cvtepi16_epi32(vx16_hil))),
                _mm256_cvtepi32_ps(_mm256_set_m128i(_mm_cvtepi16_epi32(_mm_srli_si128(vx16_hih, 8)), _mm_cvtepi16_epi32(vx16_hih)))
            };

            // Scale, add m and store
            __m256 result = _mm256_add_ps(_mm256_mul_ps(vf[0], d_v), d_m);
            _mm256_storeu_ps(y + i * QK4_1 + l + 0, result);
            result = _mm256_add_ps(_mm256_mul_ps(vf[1], d_v), d_m);
            _mm256_storeu_ps(y + i * QK4_1 + l + 8, result);
            result = _mm256_add_ps(_mm256_mul_ps(vf[2], d_v), d_m);
            _mm256_storeu_ps(y + i * QK4_1 + l + 16, result);
            result = _mm256_add_ps(_mm256_mul_ps(vf[3], d_v), d_m);
            _mm256_storeu_ps(y + i * QK4_1 + l + 24, result);
        }
    }
#elif defined(__ARM_NEON)
@@ -3117,6 +3313,36 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
        acc = _mm256_fmadd_ps( d0d1, xy, acc );
    }

    *s = hsum_float_8(acc) + summs;
#elif defined(__AVX__)
    // Initialize accumulator with zeros
    __m256 acc = _mm256_setzero_ps();

    float summs = 0;

    // Main loop
    for (int i = 0; i < nb; ++i) {
        const float * d0 = &x[i].d;
        const float * d1 = &y[i].d;

        summs += x[i].m * (y[i].s0 + y[i].s1);

        const __m256 d0v = _mm256_broadcast_ss( d0 );
        const __m256 d1v = _mm256_broadcast_ss( d1 );

        // Compute combined scales
        const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );

        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
        const __m256i bx = bytes_from_nibbles_32(x[i].qs);
        const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );

        const __m256 xy = mul_sum_i8_pairs_float(bx, by);

        // Accumulate d0*d1*x*y
        acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc );
    }

    *s = hsum_float_8(acc) + summs;
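
The decomposition above follows from q4_1's affine encoding: each weight is d*v + m, so the dot product against a q8_1 block splits into d*y.d*sum(v*q) plus m times precomputed sums of y. A scalar sketch of one block's contribution, under the assumption that y[i].s0 and y[i].s1 hold y's scale times the sums of its first and second 16 quants (hypothetical helper):

#include <stdint.h>

// Scalar model of one block of the q4_1 x q8_1 dot product above.
// v[j] in [0, 15] are x's unpacked nibbles, q[j] are y's int8 quants;
// ys0 + ys1 is assumed to equal yd * sum(q[j]).
static float dot_q4_1_q8_1_block_scalar(float xd, float xm, const uint8_t v[32],
                                        float yd, const int8_t q[32],
                                        float ys0, float ys1) {
    int32_t sumi = 0;
    for (int j = 0; j < 32; ++j) {
        sumi += (int32_t) v[j] * (int32_t) q[j];
    }
    // sum_j (xd*v + xm) * (yd*q) == xd*yd*sumi + xm*(ys0 + ys1)
    return xd * yd * (float) sumi + xm * (ys0 + ys1);
}
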
#else
    // scalar
@@ -3261,6 +3487,35 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void *
        acc = _mm256_fmadd_ps(d, q, acc);
    }

    *s = hsum_float_8(acc);
#elif defined(__AVX__)
    // Initialize accumulator with zeros
    __m256 acc = _mm256_setzero_ps();

    // Main loop
    for (int i = 0; i < nb; i++) {
        /* Compute combined scale for the block */
        const __m128 d0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].d));
        const __m128 d1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].d));
        const __m256 d = _mm256_mul_ps(_mm256_set_m128(d1, d0), _mm256_broadcast_ss(&y[i].d));

        __m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs);
        __m128i bx1 = bytes_from_nibbles_16(x[2*i + 1].qs);

        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
        const __m128i off = _mm_set1_epi8(8);
        bx0 = _mm_sub_epi8(bx0, off);
        bx1 = _mm_sub_epi8(bx1, off);
        __m256i bx = _mm256_set_m128i(bx1, bx0);

        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);

        const __m256 q = mul_sum_i8_pairs_float(bx, by);

        /* Multiply q with scale and accumulate */
        acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
    }

    *s = hsum_float_8(acc);
#else
    // scalar
@@ -3463,6 +3718,36 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
        acc = _mm256_fmadd_ps(d, q, acc);
    }

    *s = hsum_float_8(acc);
#elif defined(__AVX__)
    // Initialize accumulator with zeros
    __m256 acc = _mm256_setzero_ps();

    // Main loop
    for (int i = 0; i < nb; i++) {
        /* Compute combined scale for the block */
        const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d));

        __m256i bx = bytes_from_nibbles_32(x[i].qs);
        __m256i bxhi = bytes_from_bits_32(x[i].qh);
        __m128i bxl = _mm256_castsi256_si128(bx);
        __m128i bxh = _mm256_extractf128_si256(bx, 1);
        __m128i bxhil = _mm256_castsi256_si128(bxhi);
        __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
        bxhil = _mm_andnot_si128(bxhil, _mm_set1_epi8((char)0xF0));
        bxhih = _mm_andnot_si128(bxhih, _mm_set1_epi8((char)0xF0));
        bxl = _mm_or_si128(bxl, bxhil);
        bxh = _mm_or_si128(bxh, bxhih);
        bx = _mm256_set_m128i(bxh, bxl);

        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);

        const __m256 q = mul_sum_i8_pairs_float(bx, by);

        /* Multiply q with scale and accumulate */
        acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
    }

    *s = hsum_float_8(acc);
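
The andnot with 0xF0 above is a shortcut for q5_0's -16 offset: a q5_0 value is (nibble | (fifth_bit << 4)) - 16, and OR-ing 0xF0 onto the nibble when the fifth bit is clear produces exactly that int8 bit pattern without a separate subtract. A scalar sketch of the reconstruction (hypothetical helper):

#include <stdint.h>

// Scalar model of the q5_0 value reconstruction used above:
// v = (nibble | (fifth_bit << 4)) - 16, i.e. values in [-16, 15] before scaling by d.
static int8_t q5_0_value_scalar(uint8_t nibble, int fifth_bit) {
    return (int8_t)(((int) nibble | (fifth_bit << 4)) - 16);
}
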
#else
    // scalar
@@ -3667,6 +3952,39 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
        acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
    }

    *s = hsum_float_8(acc) + summs;
#elif defined(__AVX__)
    // change to AVX
    // Initialize accumulator with zeros
    __m256 acc = _mm256_setzero_ps();
    float summs = 0.0f;

    // Main loop
    for (int i = 0; i < nb; i++) {
        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));

        summs += GGML_FP16_TO_FP32(x[i].m) * (y[i].s0 + y[i].s1);

        __m256i bx = bytes_from_nibbles_32(x[i].qs);
        __m256i bxhi = bytes_from_bits_32(x[i].qh);
        __m128i bxl = _mm256_castsi256_si128(bx);
        __m128i bxh = _mm256_extractf128_si256(bx, 1);
        __m128i bxhil = _mm256_castsi256_si128(bxhi);
        __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
        bxhil = _mm_and_si128(bxhil, _mm_set1_epi8(0x10));
        bxhih = _mm_and_si128(bxhih, _mm_set1_epi8(0x10));
        bxl = _mm_or_si128(bxl, bxhil);
        bxh = _mm_or_si128(bxh, bxhih);
        bx = _mm256_set_m128i(bxh, bxl);

        const __m256 dy = _mm256_broadcast_ss(&y[i].d);
        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);

        const __m256 q = mul_sum_i8_pairs_float(bx, by);

        acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
    }

    *s = hsum_float_8(acc) + summs;
#else
    float sumf = 0.0;
@@ -3784,6 +4102,24 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
        acc = _mm256_fmadd_ps( d, q, acc );
    }

    *s = hsum_float_8(acc);
#elif defined(__AVX__)
    // Initialize accumulator with zeros
    __m256 acc = _mm256_setzero_ps();

    // Main loop
    for (int i = 0; i < nb; ++i) {
        // Compute combined scale for the block
        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
        __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);

        const __m256 q = mul_sum_i8_pairs_float(bx, by);

        // Multiply q with scale and accumulate
        acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc );
    }

    *s = hsum_float_8(acc);
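
The q8_0 path above is the simplest of the new kernels: per block, multiply the two scales by the int8 dot product and accumulate. A scalar sketch of the whole routine, assuming a hypothetical flat layout of one float scale plus 32 int8 quants per block:

#include <stdint.h>

// Scalar model of the q8_0 x q8_0 dot product computed above
// (hypothetical flat layout: xd[i]/yd[i] scales, xqs/yqs int8 quants).
static float dot_q8_0_scalar(int nb, const float * xd, const int8_t * xqs,
                             const float * yd, const int8_t * yqs) {
    float s = 0.0f;
    for (int i = 0; i < nb; ++i) {
        int32_t sumi = 0;
        for (int j = 0; j < 32; ++j) {
            sumi += (int32_t) xqs[32*i + j] * (int32_t) yqs[32*i + j];
        }
        s += xd[i] * yd[i] * (float) sumi;
    }
    return s;
}
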
#else
    // scalar