diff --git a/expose.cpp b/expose.cpp index b64af3434..173595122 100644 --- a/expose.cpp +++ b/expose.cpp @@ -21,7 +21,7 @@ extern "C" { std::string model = inputs.model_filename; file_format = check_file_format(model.c_str()); - printf("\nIdentified as LLAMA model: (ver %d)\n", file_format); + printf("\n---\nIdentified as LLAMA model: (ver %d)\nAttempting to Load...\n---\n", file_format); return llama_load_model(inputs, file_format); } diff --git a/ggml.c b/ggml.c index 25fa72632..b6dd3f3cf 100644 --- a/ggml.c +++ b/ggml.c @@ -461,6 +461,39 @@ static inline __m128i packNibbles( __m256i bytes ) __m128i r1 = _mm256_extracti128_si256( bytes, 1 ); return _mm_packus_epi16( r0, r1 ); } +#elif __AVX__ +static inline __m128i bytesFromNibbles( const uint8_t* rsi ) +{ + // Load 8 bytes from memory + __m128i tmp = _mm_loadu_si64( ( const __m128i* )rsi ); + + // Expand bytes into uint16_t values + __m128i bytes = _mm_cvtepu8_epi16( tmp ); + + // Unpack values into individual bytes + const __m128i lowMask = _mm_set1_epi8( 0xF ); + __m128i high = _mm_andnot_si128( lowMask, bytes ); + __m128i low = _mm_and_si128( lowMask, bytes ); + high = _mm_slli_epi16( high, 4 ); + bytes = _mm_or_si128( low, high ); + return bytes; +} + +static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) +{ + // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh + const __m128i lowByte = _mm_set1_epi16( 0xFF ); + __m128i high = _mm_andnot_si128( lowByte, bytes1 ); + __m128i low = _mm_and_si128( lowByte, bytes1 ); + high = _mm_srli_epi16( high, 4 ); + bytes1 = _mm_or_si128( low, high ); + high = _mm_andnot_si128( lowByte, bytes2 ); + low = _mm_and_si128( lowByte, bytes2 ); + high = _mm_srli_epi16( high, 4 ); + bytes2 = _mm_or_si128( low, high ); + + return _mm_packus_epi16( bytes1, bytes2); +} #endif // method 5 @@ -509,8 +542,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r const uint8_t vi0 = (int8_t)roundf(v0) + 8; const uint8_t vi1 = (int8_t)roundf(v1) + 8; - assert(vi0 >= 0 && vi0 < 16); - assert(vi1 >= 0 && vi1 < 16); + assert(vi0 < 16); + assert(vi1 < 16); pp[l/2] = vi0 | (vi1 << 4); } @@ -660,6 +693,80 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int __m128i res = packNibbles( i0 ); _mm_storeu_si128( ( __m128i* )y[i].qs, res ); } +#elif defined(__AVX__) + for (int i = 0; i < nb; i++) { + // Load elements into 4 AVX vectors + __m256 v0 = _mm256_loadu_ps( x ); + __m256 v1 = _mm256_loadu_ps( x + 8 ); + __m256 v2 = _mm256_loadu_ps( x + 16 ); + __m256 v3 = _mm256_loadu_ps( x + 24 ); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 signBit = _mm256_set1_ps( -0.0f ); + __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); + + __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); + max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); + max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); + const float maxScalar = _mm_cvtss_f32( max4 ); + + // Quantize these floats + const float d = maxScalar / 7.0f; + y[i].d = d; + const float id = ( maxScalar != 0.0f ) ? 7.0f / maxScalar : 0.0f; + const __m256 mul = _mm256_set1_ps( id ); + + // Apply the multiplier + v0 = _mm256_mul_ps( v0, mul ); + v1 = _mm256_mul_ps( v1, mul ); + v2 = _mm256_mul_ps( v2, mul ); + v3 = _mm256_mul_ps( v3, mul ); + + // Round to nearest integer + v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); + v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); + v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); + v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); + + // Convert floats to integers + __m256i i0 = _mm256_cvtps_epi32( v0 ); + __m256i i1 = _mm256_cvtps_epi32( v1 ); + __m256i i2 = _mm256_cvtps_epi32( v2 ); + __m256i i3 = _mm256_cvtps_epi32( v3 ); + + // Since we don't have in AVX some necessary functions, + // we split the registers in half and call AVX2 analogs from SSE + __m128i ni0 = _mm256_castsi256_si128( i0 ); + __m128i ni1 = _mm256_extractf128_si256( i0, 1); + __m128i ni2 = _mm256_castsi256_si128( i1 ); + __m128i ni3 = _mm256_extractf128_si256( i1, 1); + __m128i ni4 = _mm256_castsi256_si128( i2 ); + __m128i ni5 = _mm256_extractf128_si256( i2, 1); + __m128i ni6 = _mm256_castsi256_si128( i3 ); + __m128i ni7 = _mm256_extractf128_si256( i3, 1); + + // Convert int32 to int16 + ni0 = _mm_packs_epi32( ni0, ni1 ); + ni2 = _mm_packs_epi32( ni2, ni3 ); + ni4 = _mm_packs_epi32( ni4, ni5 ); + ni6 = _mm_packs_epi32( ni6, ni7 ); + // Convert int16 to int8 + ni0 = _mm_packs_epi16( ni0, ni2 ); + ni4 = _mm_packs_epi16( ni4, ni6 ); + + // Apply offset to translate the range from [ -7 .. +7 ] into [ +1 .. +15 ] + const __m128i off = _mm_set1_epi8( 8); + ni0 = _mm_add_epi8( ni0, off ); + ni4 = _mm_add_epi8( ni4, off ); + + // Compress the vector into 4 bit/value, and store + __m128i res = packNibbles( ni0, ni4 ); + _mm_storeu_si128( ( __m128i* )y[i].qs, res ); + } #elif defined(__wasm_simd128__) for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max @@ -730,8 +837,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric const uint8_t vi0 = roundf(v0); const uint8_t vi1 = roundf(v1); - assert(vi0 >= 0 && vi0 < 16); - assert(vi1 >= 0 && vi1 < 16); + assert(vi0 < 16); + assert(vi1 < 16); pp[l/2] = vi0 | (vi1 << 4); } @@ -1726,7 +1833,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest const block_q4_0 * restrict x = vx; const block_q4_0 * restrict y = vy; - ggml_float sumf = 0.0; + float sumf = 0.0; #if defined(__ARM_NEON) float sum0 = 0.0f; @@ -1821,7 +1928,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest #endif } - sumf = (ggml_float)(sum0 + sum1); + sumf = sum0 + sum1; #elif defined(__AVX512F__) // Initialize accumulator with zeros __m512 acc0 = _mm512_setzero_ps(); @@ -1855,6 +1962,10 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest __m256 acc = _mm256_setzero_ps(); // Main loop + // TODO: figure a way to do this in a portable way + #ifdef __GNUC__ + #pragma GCC unroll 16 + #endif for (int i = 0; i < nb; ++i) { // Compute combined scale for the block const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); @@ -1868,20 +1979,21 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest bx = _mm256_sub_epi8( bx, off ); by = _mm256_sub_epi8( by, off ); - // Sign-extend first 16 signed bytes into int16_t - __m256i x16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( bx ) ); - __m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) ); - // Compute products of int16_t integers, add pairwise - __m256i i32 = _mm256_madd_epi16( x16, y16 ); + // Get absolute values of x vectors + const __m256i ax = _mm256_sign_epi8(bx, bx); - // Sign-extend last 16 signed bytes into int16_t vectors - x16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( bx, 1 ) ); - y16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) ); - // Accumulate products of int16_t integers - i32 = _mm256_add_epi32( i32, _mm256_madd_epi16( x16, y16 ) ); + // Sign the values of the y vectors + const __m256i sy = _mm256_sign_epi8(by, bx); + + // Perform multiplication and create 16-bit values + const __m256i dot = _mm256_maddubs_epi16(ax, sy); + + const __m256i ones = _mm256_set1_epi16(1); + const __m256i i32 = _mm256_madd_epi16(ones, dot); // Convert int32_t to float - __m256 p = _mm256_cvtepi32_ps( i32 ); + const __m256 p = _mm256_cvtepi32_ps( i32 ); + // Apply the scale, and accumulate acc = _mm256_fmadd_ps( d, p, acc ); } @@ -1892,6 +2004,52 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest res = _mm_add_ps( res, _mm_movehl_ps( res, res ) ); res = _mm_add_ss( res, _mm_movehdup_ps( res ) ); + sumf = _mm_cvtss_f32( res ); +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; ++i) { + // Compute combined scale for the block + const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); + + __m128i i32[2]; + for (int j = 0; j < 2; ++j) { + // Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes + __m128i bx = bytesFromNibbles( x[i].qs + 8*j ); + __m128i by = bytesFromNibbles( y[i].qs + 8*j ); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. + const __m128i off = _mm_set1_epi8( 8 ); + bx = _mm_sub_epi8( bx, off ); + by = _mm_sub_epi8( by, off ); + + // Get absolute values of x vectors + const __m128i ax = _mm_sign_epi8(bx, bx); + + // Sign the values of the y vectors + const __m128i sy = _mm_sign_epi8(by, bx); + + // Perform multiplication and create 16-bit values + const __m128i dot = _mm_maddubs_epi16(ax, sy); + + const __m128i ones = _mm_set1_epi16(1); + i32[j] = _mm_madd_epi16(ones, dot); + } + + // Convert int32_t to float + __m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] )); + // Apply the scale, and accumulate + acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); + } + + // Return horizontal sum of the acc vector + __m128 res = _mm256_extractf128_ps( acc, 1 ); + res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) ); + res = _mm_add_ps( res, _mm_movehl_ps( res, res ) ); + res = _mm_add_ss( res, _mm_movehdup_ps( res ) ); + sumf = _mm_cvtss_f32( res ); #elif defined(__wasm_simd128__) // wasm simd diff --git a/llamacpp.dll b/llamacpp.dll index cfbcb511c..23f4d62a1 100644 Binary files a/llamacpp.dll and b/llamacpp.dll differ diff --git a/llamacpp_blas.dll b/llamacpp_blas.dll index bf59802c8..0d180f202 100644 Binary files a/llamacpp_blas.dll and b/llamacpp_blas.dll differ diff --git a/main.exe b/main.exe index 33e6241c3..22b9eef24 100644 Binary files a/main.exe and b/main.exe differ diff --git a/quantize.exe b/quantize.exe index 2c78ac96f..5d987bf1a 100644 Binary files a/quantize.exe and b/quantize.exe differ