cosmopolitan/third_party/ggml/ggjt.v1.internal.h

#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_INTERNAL_H_
#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_INTERNAL_H_
#include "libc/str/str.h"
#include "third_party/intel/immintrin.internal.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
// clang-format off

#ifdef __AVX__
// horizontally add 8 floats
static inline float hsum_float_8(const __m256 x) {
    __m128 res = _mm256_extractf128_ps(x, 1);
    res = _mm_add_ps(res, _mm256_castps256_ps128(x));
    res = _mm_add_ps(res, _mm_movehl_ps(res, res));
    res = _mm_add_ss(res, _mm_movehdup_ps(res));
    return _mm_cvtss_f32(res);
}
#endif /* AVX */

#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
// Expand 16 packed nibbles (8 bytes read from rsi) into 16 separate bytes.
// Every output byte is in the [ 0 .. 15 ] interval; source byte i yields its
// low nibble at output byte 2*i and its high nibble at output byte 2*i + 1.
static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi) {
    const __m128i nibbleMask = _mm_set1_epi8( 0xF );
    // Read 8 bytes and zero-extend each one into its own 16-bit lane
    const __m128i packed = _mm_loadl_epi64( ( const __m128i* )rsi );
    const __m128i widened = _mm_cvtepu8_epi16( packed );
    // The low nibble is already in place in the lane's low byte
    const __m128i lowNibbles = _mm_and_si128( nibbleMask, widened );
    // The high nibble moves up 4 bits, landing in the lane's upper byte
    const __m128i highNibbles = _mm_slli_epi16( _mm_andnot_si128( nibbleMask, widened ), 4 );
    return _mm_or_si128( lowNibbles, highNibbles );
}
#endif /* AVX || AVX2 || AVX512 */

#if defined(__AVX2__) || defined(__AVX512F__)
// Turn the 32 bits stored at x into 32 bytes: a byte is 0xFF when its
// corresponding bit is set and 0x00 when it is clear.
static inline __m256i bytes_from_bits_32(const uint8_t * x) {
    // Byte i of each 64-bit group has every bit set except bit i
    const __m256i all_but_own_bit = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
    // Selects source byte 0 for output bytes 0..7, byte 1 for 8..15, and so on
    const __m256i byte_select = _mm256_set_epi64x(
        0x0303030303030303, 0x0202020202020202,
        0x0101010101010101, 0x0000000000000000);
    const __m256i all_ones = _mm256_set1_epi64x(-1);

    // memcpy keeps the 4-byte read valid for any alignment of x
    uint32_t bits;
    memcpy(&bits, x, sizeof(bits));

    // Replicate each source byte across the 8 output bytes it controls
    const __m256i spread = _mm256_shuffle_epi8(_mm256_set1_epi32(bits), byte_select);
    // After the OR a byte is all ones only if its own bit was set in the source
    const __m256i filled = _mm256_or_si256(spread, all_but_own_bit);
    return _mm256_cmpeq_epi8(filled, all_ones);
}
#endif /* AVX2 || AVX512 */

#if defined(__AVX2__) || defined(__AVX512F__)

// Sum adjacent int16_t lanes of x into int32_t lanes and convert them to floats.
static inline __m256 sum_i16_pairs_float(const __m256i x) {
    // madd against a vector of 1s multiplies every lane by one and adds each
    // neighbouring pair, which leaves just the pairwise sums
    return _mm256_cvtepi32_ps(_mm256_madd_epi16(_mm256_set1_epi16(1), x));
}

// Expand 32 packed nibbles (16 bytes read from rsi) into 32 separate bytes.
// Every output byte is in the [ 0 .. 15 ] interval; source byte i yields its
// low nibble at output byte 2*i and its high nibble at output byte 2*i + 1.
static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) {
    const __m256i nibbleMask = _mm256_set1_epi8( 0xF );
    // Unaligned 16-byte read, then zero-extend each byte into a 16-bit lane
    const __m128i packed = _mm_loadu_si128( ( const __m128i* )rsi );
    const __m256i widened = _mm256_cvtepu8_epi16( packed );
    // The low nibble is already in place in the lane's low byte
    const __m256i lowNibbles = _mm256_and_si256( nibbleMask, widened );
    // The high nibble moves up 4 bits, landing in the lane's upper byte
    const __m256i highNibbles = _mm256_slli_epi16( _mm256_andnot_si256( nibbleMask, widened ), 4 );
    return _mm256_or_si256( lowNibbles, highNibbles );
}

// Multiply the int8_t lanes of x and y, add each group of four adjacent
// products into one int32_t lane, and return those sums as floats.
static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
    // The multiply instructions below take an unsigned first operand, so use
    // |x| there and carry the sign of x over to y to keep each product's sign
    const __m256i magnitude_x = _mm256_sign_epi8(x, x);
    const __m256i resigned_y = _mm256_sign_epi8(y, x);
#ifdef __AVXVNNI__
    // One instruction multiplies and accumulates four products per 32-bit lane
    return _mm256_cvtepi32_ps(
        _mm256_dpbusd_epi32(_mm256_setzero_si256(), magnitude_x, resigned_y));
#else
    // First pairwise add yields 16-bit sums; the helper does the second one
    return sum_i16_pairs_float(_mm256_maddubs_epi16(magnitude_x, resigned_y));
#endif
}

// Pack the two nibbles held in every 16-bit lane of `bytes` into one byte.
//
// Each lane is expected to look like 0000_abcd_0000_efgh (one nibble in the
// low half of each byte). The result has one byte abcd_efgh per input lane,
// in lane order, so 16 lanes become 16 bytes.
static inline __m128i packNibbles( __m256i bytes ) {
    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
#if defined(__AVX512BW__) && defined(__AVX512VL__)
    // _mm256_cvtepi16_epi8 (vpmovwb) needs AVX512BW and AVX512VL. AVX512F alone
    // is not enough, so targets lacking either extension use the AVX2 path below.
    const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4);   // 0000_0000_abcd_0000
    bytes = _mm256_or_si256(bytes, bytes_srli_4);               // 0000_abcd_abcd_efgh
    return _mm256_cvtepi16_epi8(bytes);                         // abcd_efgh (truncating narrow)
#else
    const __m256i lowByte = _mm256_set1_epi16( 0xFF );
    __m256i high = _mm256_andnot_si256( lowByte, bytes );       // 0000_abcd_0000_0000
    __m256i low = _mm256_and_si256( lowByte, bytes );           // 0000_0000_0000_efgh
    high = _mm256_srli_epi16( high, 4 );                        // 0000_0000_abcd_0000
    bytes = _mm256_or_si256( low, high );                       // 0000_0000_abcd_efgh
    // Compress uint16_t lanes into bytes
    __m128i r0 = _mm256_castsi256_si128( bytes );
    __m128i r1 = _mm256_extracti128_si256( bytes, 1 );
    return _mm_packus_epi16( r0, r1 );
#endif
}

#elif defined(__AVX__)

// Pack the two nibbles held in every 16-bit lane of bytes1 and bytes2 into
// one byte each. Lanes look like 0000_abcd_0000_efgh on input; the result
// holds the abcd_efgh bytes of bytes1 followed by those of bytes2.
static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) {
    const __m128i lowByteMask = _mm_set1_epi16( 0xFF );
    // Upper-byte nibble shifted down next to the lower one: 0000_0000_abcd_0000
    const __m128i upper1 = _mm_srli_epi16( _mm_andnot_si128( lowByteMask, bytes1 ), 4 );
    const __m128i upper2 = _mm_srli_epi16( _mm_andnot_si128( lowByteMask, bytes2 ), 4 );
    // Merge with the untouched low byte: 0000_0000_abcd_efgh
    const __m128i merged1 = _mm_or_si128( _mm_and_si128( lowByteMask, bytes1 ), upper1 );
    const __m128i merged2 = _mm_or_si128( _mm_and_si128( lowByteMask, bytes2 ), upper2 );
    // Narrow the 16-bit lanes of both inputs into one vector of 16 bytes
    return _mm_packus_epi16( merged1, merged2 );
}

#endif /* __AVX__ */

COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_INTERNAL_H_ */
Add support for new GGJT v2 quantizers. This change makes quantized models (e.g. q4_0) go 10% faster on Macs; however, it doesn't offer much improvement for Intel PC hardware. This change syncs llama.cpp 699b1ad7fe6f7b9e41d3cb41e61a8cc3ea5fc6b5, which recently made a breaking change to nearly all its file formats without any migration. Since that'll break hundreds upon hundreds of models on websites like HuggingFace, llama.com will support both file formats, because llama.com will never ever break the GGJT file format. 2023-05-13 15:08:32 +00:00			`#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_INTERNAL_H_`
			`#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_INTERNAL_H_`
			`#include "libc/str/str.h"`
			`#include "third_party/intel/immintrin.internal.h"`
			`#if !(__ASSEMBLER__ + __LINKER__ + 0)`
			`COSMOPOLITAN_C_START_`
			`// clang-format off`

			`#ifdef __AVX__`
			`// horizontally add 8 floats`
			`static inline float hsum_float_8(const __m256 x) {`
			`__m128 res = _mm256_extractf128_ps(x, 1);`
			`res = _mm_add_ps(res, _mm256_castps256_ps128(x));`
			`res = _mm_add_ps(res, _mm_movehl_ps(res, res));`
			`res = _mm_add_ss(res, _mm_movehdup_ps(res));`
			`return _mm_cvtss_f32(res);`
			`}`
			`#endif /* AVX */`

			`#if defined(__AVX__) \|\| defined(__AVX2__) \|\| defined(__AVX512F__)`
			`// Unpack 16 4-bit fields into 16 bytes`
			`// The output vector contains 16 bytes, each one in [ 0 .. 15 ] interval`
			`static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi) {`
			`// Load 8 bytes from memory`
			`__m128i tmp = _mm_loadl_epi64( ( const __m128i* )rsi );`
			`// Expand bytes into uint16_t values`
			`__m128i bytes = _mm_cvtepu8_epi16( tmp );`
			`// Unpack values into individual bytes`
			`const __m128i lowMask = _mm_set1_epi8( 0xF );`
			`__m128i high = _mm_andnot_si128( lowMask, bytes );`
			`__m128i low = _mm_and_si128( lowMask, bytes );`
			`high = _mm_slli_epi16( high, 4 );`
			`bytes = _mm_or_si128( low, high );`
			`return bytes;`
			`}`
			`#endif /* AVX \|\| AVX2 \|\| AVX512 */`

			`#if defined(__AVX2__) \|\| defined(__AVX512F__)`
			`// spread 32 bits to 32 bytes { 0x00, 0xFF }`
			`static inline __m256i bytes_from_bits_32(const uint8_t * x) {`
			`uint32_t x32;`
			`memcpy(&x32, x, sizeof(uint32_t));`
			`const __m256i shuf_mask = _mm256_set_epi64x(`
			`0x0303030303030303, 0x0202020202020202,`
			`0x0101010101010101, 0x0000000000000000);`
			`__m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask);`
			`const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);`
			`bytes = _mm256_or_si256(bytes, bit_mask);`
			`return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1));`
			`}`
			`#endif /* AVX2 \|\| AVX512 */`

			`#if defined(__AVX2__) \|\| defined(__AVX512F__)`

			`// add int16_t pairwise and return as float vector`
			`static inline __m256 sum_i16_pairs_float(const __m256i x) {`
			`const __m256i ones = _mm256_set1_epi16(1);`
			`const __m256i summed_pairs = _mm256_madd_epi16(ones, x);`
			`return _mm256_cvtepi32_ps(summed_pairs);`
			`}`

			`// Unpack 32 4-bit fields into 32 bytes`
			`// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval`
			`static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) {`
			`// Load 16 bytes from memory`
			`__m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi );`
			`// Expand bytes into uint16_t values`
			`__m256i bytes = _mm256_cvtepu8_epi16( tmp );`
			`// Unpack values into individual bytes`
			`const __m256i lowMask = _mm256_set1_epi8( 0xF );`
			`__m256i high = _mm256_andnot_si256( lowMask, bytes );`
			`__m256i low = _mm256_and_si256( lowMask, bytes );`
			`high = _mm256_slli_epi16( high, 4 );`
			`bytes = _mm256_or_si256( low, high );`
			`return bytes;`
			`}`

			`// multiply int8_t, add results pairwise twice and return as float vector`
			`static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {`
			`// Get absolute values of x vectors`
			`const __m256i ax = _mm256_sign_epi8(x, x);`
			`// Sign the values of the y vectors`
			`const __m256i sy = _mm256_sign_epi8(y, x);`
			`#ifdef __AVXVNNI__`
			`const __m256i zero = _mm256_setzero_si256();`
			`const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);`
			`return _mm256_cvtepi32_ps(summed_pairs);`
			`#else`
			`// Perform multiplication and create 16-bit values`
			`const __m256i dot = _mm256_maddubs_epi16(ax, sy);`
			`return sum_i16_pairs_float(dot);`
			`#endif`
			`}`

			`static inline __m128i packNibbles( __m256i bytes ) {`
			`// Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh`
			`#if defined(__AVX512F__)`
			`const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000`
			`bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh`
			`return _mm256_cvtepi16_epi8(bytes); // abcd_efgh`
			`#else`
			`const __m256i lowByte = _mm256_set1_epi16( 0xFF );`
			`__m256i high = _mm256_andnot_si256( lowByte, bytes );`
			`__m256i low = _mm256_and_si256( lowByte, bytes );`
			`high = _mm256_srli_epi16( high, 4 );`
			`bytes = _mm256_or_si256( low, high );`
			`// Compress uint16_t lanes into bytes`
			`__m128i r0 = _mm256_castsi256_si128( bytes );`
			`__m128i r1 = _mm256_extracti128_si256( bytes, 1 );`
			`return _mm_packus_epi16( r0, r1 );`
			`#endif`
			`}`

			`#elif defined(__AVX__)`

			`static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) {`
			`// Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh`
			`const __m128i lowByte = _mm_set1_epi16( 0xFF );`
			`__m128i high = _mm_andnot_si128( lowByte, bytes1 );`
			`__m128i low = _mm_and_si128( lowByte, bytes1 );`
			`high = _mm_srli_epi16( high, 4 );`
			`bytes1 = _mm_or_si128( low, high );`
			`high = _mm_andnot_si128( lowByte, bytes2 );`
			`low = _mm_and_si128( lowByte, bytes2 );`
			`high = _mm_srli_epi16( high, 4 );`
			`bytes2 = _mm_or_si128( low, high );`
			`return _mm_packus_epi16( bytes1, bytes2);`
			`}`

			`#endif /* __AVX__ */`

			`COSMOPOLITAN_C_END_`
			`#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */`
			`#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_INTERNAL_H_ */`