Re-use existing bytesFromNibbles function

2023-03-20 15:55:42 -07:00 · 2023-03-20 15:55:42 -07:00 · 76af3f5a64
commit 76af3f5a64
parent a6598801ad
1 changed files with 2 additions and 17 deletions
--- a/ggml.c
+++ b/ggml.c
@@ -361,7 +361,7 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);

 // AVX routines provided by GH user Const-me
 // ref: https://github.com/ggerganov/ggml/pull/27#issuecomment-1464934600
-#if __AVX2__
+#if __AVX2__ || __AVX512F__
 // Unpack 32 4-bit fields into 32 bytes
 // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
 static inline __m256i bytesFromNibbles( const uint8_t* rsi )
@@ -1418,21 +1418,6 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
 #error "not implemented for QK"
 #endif
 #elif defined(__AVX512F__)
-inline __m256i bytesFromNibbles( const uint8_t* rsi ){
-    // Load 16 bytes from memory
-    __m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi );
-
-    // Expand bytes into uint16_t values
-    __m256i bytes = _mm256_cvtepu8_epi16( tmp );
-
-    // Unpack values into individual bytes
-    const __m256i lowMask = _mm256_set1_epi8( 0xF );
-    __m256i high = _mm256_andnot_si256( lowMask, bytes );
-    __m256i low = _mm256_and_si256( lowMask, bytes );
-    high = _mm256_slli_epi16( high, 4 );
-    bytes = _mm256_or_si256( low, high );
-    return bytes;
-}

 inline __m512 process_one_block(
    __m512 acc,