#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_INTERNAL_H_ #define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_INTERNAL_H_ #include "libc/str/str.h" #include "third_party/intel/immintrin.internal.h" #if !(__ASSEMBLER__ + __LINKER__ + 0) COSMOPOLITAN_C_START_ // clang-format off #ifdef __AVX__ // horizontally add 8 floats static inline float hsum_float_8(const __m256 x) { __m128 res = _mm256_extractf128_ps(x, 1); res = _mm_add_ps(res, _mm256_castps256_ps128(x)); res = _mm_add_ps(res, _mm_movehl_ps(res, res)); res = _mm_add_ss(res, _mm_movehdup_ps(res)); return _mm_cvtss_f32(res); } #endif /* AVX */ #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) // Unpack 16 4-bit fields into 16 bytes // The output vector contains 16 bytes, each one in [ 0 .. 15 ] interval static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi) { // Load 8 bytes from memory __m128i tmp = _mm_loadl_epi64( ( const __m128i* )rsi ); // Expand bytes into uint16_t values __m128i bytes = _mm_cvtepu8_epi16( tmp ); // Unpack values into individual bytes const __m128i lowMask = _mm_set1_epi8( 0xF ); __m128i high = _mm_andnot_si128( lowMask, bytes ); __m128i low = _mm_and_si128( lowMask, bytes ); high = _mm_slli_epi16( high, 4 ); bytes = _mm_or_si128( low, high ); return bytes; } #endif /* AVX || AVX2 || AVX512 */ #if defined(__AVX2__) || defined(__AVX512F__) // spread 32 bits to 32 bytes { 0x00, 0xFF } static inline __m256i bytes_from_bits_32(const uint8_t * x) { uint32_t x32; memcpy(&x32, x, sizeof(uint32_t)); const __m256i shuf_mask = _mm256_set_epi64x( 0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000); __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask); const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe); bytes = _mm256_or_si256(bytes, bit_mask); return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1)); } #endif /* AVX2 || AVX512 */ #if defined(__AVX2__) || defined(__AVX512F__) // add int16_t pairwise and return as float vector static inline __m256 sum_i16_pairs_float(const __m256i x) { const __m256i ones = _mm256_set1_epi16(1); const __m256i summed_pairs = _mm256_madd_epi16(ones, x); return _mm256_cvtepi32_ps(summed_pairs); } // Unpack 32 4-bit fields into 32 bytes // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) { // Load 16 bytes from memory __m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi ); // Expand bytes into uint16_t values __m256i bytes = _mm256_cvtepu8_epi16( tmp ); // Unpack values into individual bytes const __m256i lowMask = _mm256_set1_epi8( 0xF ); __m256i high = _mm256_andnot_si256( lowMask, bytes ); __m256i low = _mm256_and_si256( lowMask, bytes ); high = _mm256_slli_epi16( high, 4 ); bytes = _mm256_or_si256( low, high ); return bytes; } // multiply int8_t, add results pairwise twice and return as float vector static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { // Get absolute values of x vectors const __m256i ax = _mm256_sign_epi8(x, x); // Sign the values of the y vectors const __m256i sy = _mm256_sign_epi8(y, x); #ifdef __AVXVNNI__ const __m256i zero = _mm256_setzero_si256(); const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); return _mm256_cvtepi32_ps(summed_pairs); #else // Perform multiplication and create 16-bit values const __m256i dot = _mm256_maddubs_epi16(ax, sy); return sum_i16_pairs_float(dot); #endif } static inline __m128i packNibbles( __m256i bytes ) { // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh #if defined(__AVX512F__) const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000 bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh return _mm256_cvtepi16_epi8(bytes); // abcd_efgh #else const __m256i lowByte = _mm256_set1_epi16( 0xFF ); __m256i high = _mm256_andnot_si256( lowByte, bytes ); __m256i low = _mm256_and_si256( lowByte, bytes ); high = _mm256_srli_epi16( high, 4 ); bytes = _mm256_or_si256( low, high ); // Compress uint16_t lanes into bytes __m128i r0 = _mm256_castsi256_si128( bytes ); __m128i r1 = _mm256_extracti128_si256( bytes, 1 ); return _mm_packus_epi16( r0, r1 ); #endif } #elif defined(__AVX__) static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) { // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh const __m128i lowByte = _mm_set1_epi16( 0xFF ); __m128i high = _mm_andnot_si128( lowByte, bytes1 ); __m128i low = _mm_and_si128( lowByte, bytes1 ); high = _mm_srli_epi16( high, 4 ); bytes1 = _mm_or_si128( low, high ); high = _mm_andnot_si128( lowByte, bytes2 ); low = _mm_and_si128( lowByte, bytes2 ); high = _mm_srli_epi16( high, 4 ); bytes2 = _mm_or_si128( low, high ); return _mm_packus_epi16( bytes1, bytes2); } #endif /* __AVX__ */ COSMOPOLITAN_C_END_ #endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ #endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_INTERNAL_H_ */