iq2xs: faster AVX2 dot product

This commit is contained in:
Iwan Kawrakow 2024-01-29 07:21:30 +02:00
parent b764b8f1d0
commit 76f7befaa1

View file

@ -8458,17 +8458,40 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
const __m128i m4 = _mm_set1_epi8(0xf);
const __m128i m1 = _mm_set1_epi8(1);
const __m128i m511 = _mm_set1_epi16(511);
const __m128i m127 = _mm_set1_epi16(127);
const __m256i m511 = _mm256_set1_epi16(511);
const __m256i m127 = _mm256_set1_epi16(127);
const __m256i mxf = _mm256_set1_epi16(0xf);
const __m256i mone16 = _mm256_set1_epi16(1);
const __m256i mone8 = _mm256_set1_epi8(1);
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
static const uint8_t k_bit_counts[32] = {
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
};
static const char block_sign_shuffle_mask_bytes[64] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
};
static const uint8_t bit_selector_mask_bytes[32] = {
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
};
const __m256i bit_counts_table = _mm256_loadu_si256((const __m256i*)k_bit_counts);
const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes);
const __m256i block1_sign_shuffle = _mm256_loadu_si256((const __m256i*)(block_sign_shuffle_mask_bytes+ 0));
const __m256i block2_sign_shuffle = _mm256_loadu_si256((const __m256i*)(block_sign_shuffle_mask_bytes+32));
//const __m256i block1_sign_shuffle = _mm256_set_epi64x(0x0606060606060606, 0x0404040404040404, 0x0202020202020202, 0x0000000000000000);
//const __m256i block2_sign_shuffle = _mm256_set_epi64x(0x0e0e0e0e0e0e0e0e, 0x0c0c0c0c0c0c0c0c, 0x0a0a0a0a0a0a0a0a, 0x0808080808080808);
uint64_t aux64;
// somewhat hacky, but gives a significant boost in performance
__m128i aux_gindex, aux_sindex;
__m256i aux_gindex;
const uint16_t * gindex = (const uint16_t *)&aux_gindex;
const uint16_t * sindex = (const uint16_t *)&aux_sindex;
__m256 accumf = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
@ -8483,26 +8506,62 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
__m256i sumi1 = _mm256_setzero_si256();
__m256i sumi2 = _mm256_setzero_si256();
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
const __m256i q2_data = _mm256_loadu_si256((const __m256i*)q2); q2 += 16;
aux_gindex = _mm256_and_si256(q2_data, m511);
const __m256i partial_sign_bits = _mm256_and_si256(_mm256_srli_epi16(q2_data, 9), m127);
const __m256i bit_counts1 = _mm256_shuffle_epi8(bit_counts_table, _mm256_and_si256(partial_sign_bits, mxf));
const __m256i bit_counts2 = _mm256_shuffle_epi8(bit_counts_table, _mm256_and_si256(_mm256_srli_epi16(partial_sign_bits, 4), mxf));
const __m256i bit_counts = _mm256_add_epi8(bit_counts1, bit_counts2);
const __m256i full_sign_bits = _mm256_or_si256(partial_sign_bits, _mm256_slli_epi16(_mm256_and_si256(bit_counts, mone16), 7));
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
const __m128i q2_data = _mm_loadu_si128((const __m128i*)q2); q2 += 8;
aux_gindex = _mm_and_si128(q2_data, m511);
aux_sindex = _mm_and_si128(_mm_srli_epi16(q2_data, 9), m127);
const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]], iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]);
const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]], iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]);
const __m256i s2_1 = _mm256_set_epi64x(signs64[sindex[3]], signs64[sindex[2]], signs64[sindex[1]], signs64[sindex[0]]);
const __m256i s2_2 = _mm256_set_epi64x(signs64[sindex[7]], signs64[sindex[6]], signs64[sindex[5]], signs64[sindex[4]]);
const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
const __m256i q8_3 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
const __m256i q8_4 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]],
iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]);
const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]],
iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]);
const __m256i q2_3 = _mm256_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]],
iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]);
const __m256i q2_4 = _mm256_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]],
iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits);
const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1);
const __m256i full_signs_1 = _mm256_set_m128i(full_signs_l, full_signs_l);
const __m256i full_signs_2 = _mm256_set_m128i(full_signs_h, full_signs_h);
__m256i signs = _mm256_shuffle_epi8(full_signs_1, block1_sign_shuffle);
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone8));
signs = _mm256_shuffle_epi8(full_signs_1, block2_sign_shuffle);
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone8));
signs = _mm256_shuffle_epi8(full_signs_2, block1_sign_shuffle);
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
const __m256i q8s_3 = _mm256_sign_epi8(q8_3, _mm256_or_si256(signs, mone8));
signs = _mm256_shuffle_epi8(full_signs_2, block2_sign_shuffle);
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
const __m256i q8s_4 = _mm256_sign_epi8(q8_4, _mm256_or_si256(signs, mone8));
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
const __m256i dot3 = _mm256_maddubs_epi16(q2_3, q8s_3);
const __m256i dot4 = _mm256_maddubs_epi16(q2_4, q8s_4);
const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0)));
const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1)));
const __m256i sc3 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2)));
const __m256i sc4 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3)));
sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1));
sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2));
sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot3, sc3));
sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot4, sc4));
}
accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);