minor
This commit is contained in:
parent
afe94e878b
commit
5bb5327833
1 changed files with 45 additions and 2 deletions
47
ggml.c
47
ggml.c
|
@ -434,7 +434,51 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
|
|||
// quantization
|
||||
//
|
||||
|
||||
#define QK 32
|
||||
#if __AVX__ || __AVX2__ || __AVX512F__
|
||||
// Unpack 16 4-bit fields into 16 bytes
|
||||
// The output vector contains 16 bytes, each one in [ 0 .. 15 ] interval
|
||||
static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi)
|
||||
{
|
||||
// Load 8 bytes from memory
|
||||
__m128i tmp = _mm_loadl_epi64( ( const __m128i* )rsi );
|
||||
|
||||
// Expand bytes into uint16_t values
|
||||
__m128i bytes = _mm_cvtepu8_epi16( tmp );
|
||||
|
||||
// Unpack values into individual bytes
|
||||
const __m128i lowMask = _mm_set1_epi8( 0xF );
|
||||
__m128i high = _mm_andnot_si128( lowMask, bytes );
|
||||
__m128i low = _mm_and_si128( lowMask, bytes );
|
||||
high = _mm_slli_epi16( high, 4 );
|
||||
bytes = _mm_or_si128( low, high );
|
||||
return bytes;
|
||||
}
|
||||
|
||||
// horizontally add 8 floats
|
||||
static inline float hsum_float_8(const __m256 x) {
|
||||
__m128 res = _mm256_extractf128_ps(x, 1);
|
||||
res = _mm_add_ps(res, _mm256_castps256_ps128(x));
|
||||
res = _mm_add_ps(res, _mm_movehl_ps(res, res));
|
||||
res = _mm_add_ss(res, _mm_movehdup_ps(res));
|
||||
return _mm_cvtss_f32(res);
|
||||
}
|
||||
|
||||
// horizontally add 8 int32_t
|
||||
static inline int hsum_i32_8(const __m256i a) {
|
||||
const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
|
||||
const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
|
||||
const __m128i sum64 = _mm_add_epi32(hi64, sum128);
|
||||
const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
|
||||
}
|
||||
|
||||
// horizontally add 4 int32_t
|
||||
static inline int hsum_i32_4(const __m128i a) {
|
||||
const __m128i hi64 = _mm_unpackhi_epi64(a, a);
|
||||
const __m128i sum64 = _mm_add_epi32(hi64, a);
|
||||
const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
|
||||
}
|
||||
|
||||
// AVX routine provided by GH user jon-chuang
|
||||
#if __AVX2__ || __AVX512F__
|
||||
|
@ -509,7 +553,6 @@ void ggml_mul_row_f32_tall_skinny(const float * A, const float * B, float * C, i
|
|||
_mm_maskstore_ps(&C[j], mask_vec, c_vec);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// AVX routines provided by GH user Const-me
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue