initial iq4_xs
This commit is contained in:
parent
10ceba354a
commit
0fd5a1bb58
1 changed files with 56 additions and 0 deletions
|
@ -10669,6 +10669,14 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
|
|||
}
|
||||
|
||||
|
||||
#if defined(__AVX__)
|
||||
static inline __m128i mul_add_epi8(const __m128i x, const __m128i y) {
|
||||
const __m128i ax = _mm_sign_epi8(x, x);
|
||||
const __m128i sy = _mm_sign_epi8(y, x);
|
||||
return _mm_maddubs_epi16(ax, sy);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
|
||||
const __m256i ax = _mm256_sign_epi8(x, x);
|
||||
|
@ -11453,6 +11461,54 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|||
|
||||
*s = hsum_float_8(accum);
|
||||
|
||||
#elif defined __AVX__
|
||||
const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
|
||||
const __m128i m4b = _mm_set1_epi8(0x0f);
|
||||
|
||||
__m256 accum = _mm256_setzero_ps();
|
||||
for (int ibl = 0; ibl < nb; ++ibl) {
|
||||
const uint8_t * qs = x[ibl].qs;
|
||||
const int8_t * q8 = y[ibl].qs;
|
||||
uint16_t sh = x[ibl].scales_h;
|
||||
__m128i sumi1_0 = _mm_setzero_si128();
|
||||
__m128i sumi1_1 = _mm_setzero_si128();
|
||||
__m128i sumi2_0 = _mm_setzero_si128();
|
||||
__m128i sumi2_1 = _mm_setzero_si128();
|
||||
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
||||
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
|
||||
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
|
||||
const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||
const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||
const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||
const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||
const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
|
||||
const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
|
||||
const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
|
||||
const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
|
||||
const __m128i p16_1_0 = mul_add_epi8(q4b_1_0, q8b_1_0);
|
||||
const __m128i p16_1_1 = mul_add_epi8(q4b_1_1, q8b_1_1);
|
||||
const __m128i p16_2_0 = mul_add_epi8(q4b_2_0, q8b_2_0);
|
||||
const __m128i p16_2_1 = mul_add_epi8(q4b_2_1, q8b_2_1);
|
||||
const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
|
||||
const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32;
|
||||
sh >>= 4;
|
||||
const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, _mm_set1_epi16(ls1));
|
||||
const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, _mm_set1_epi16(ls1));
|
||||
const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, _mm_set1_epi16(ls2));
|
||||
const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, _mm_set1_epi16(ls2));
|
||||
sumi1_0 = _mm_add_epi32(p_1_0, sumi1_0);
|
||||
sumi1_1 = _mm_add_epi32(p_1_1, sumi1_1);
|
||||
sumi2_0 = _mm_add_epi32(p_2_0, sumi2_0);
|
||||
sumi2_1 = _mm_add_epi32(p_2_1, sumi2_1);
|
||||
}
|
||||
__m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0);
|
||||
__m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1);
|
||||
accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
|
||||
_mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum);
|
||||
}
|
||||
|
||||
*s = hsum_float_8(accum);
|
||||
|
||||
#elif defined(__POWER9_VECTOR__)
|
||||
const vector signed char lowMask = vec_splats((signed char)0xF);
|
||||
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue