iq1_m
This commit is contained in:
parent
8d1d112a9f
commit
5ff64adfe4
1 changed files with 86 additions and 0 deletions
|
@ -11145,6 +11145,92 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
|
|||
|
||||
*s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
|
||||
|
||||
#elif defined __AVX__
|
||||
const __m128i mask = _mm_set1_epi16(0x7);
|
||||
const __m128i mone = _mm_set1_epi16(1);
|
||||
|
||||
__m256 accum1 = _mm256_setzero_ps();
|
||||
__m256 accum2 = _mm256_setzero_ps();
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const int8_t * q8 = y[i].qs;
|
||||
const uint8_t * qs = x[i].qs;
|
||||
const uint8_t * qh = x[i].qh;
|
||||
const uint16_t * sc = (const uint16_t *)x[i].scales;
|
||||
|
||||
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
||||
|
||||
__m128i sumi1_0 = _mm_setzero_si128();
|
||||
__m128i sumi1_1 = _mm_setzero_si128();
|
||||
__m128i sumi2_0 = _mm_setzero_si128();
|
||||
__m128i sumi2_1 = _mm_setzero_si128();
|
||||
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
||||
const __m128i q1b_1_0 = _mm_set_epi64x(
|
||||
iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]);
|
||||
const __m128i q1b_1_1 = _mm_set_epi64x(
|
||||
iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)]);
|
||||
const __m128i q1b_2_0 = _mm_set_epi64x(
|
||||
iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]);
|
||||
const __m128i q1b_2_1 = _mm_set_epi64x(
|
||||
iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)]);
|
||||
const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||
const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||
const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||
const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||
|
||||
const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
|
||||
const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
|
||||
const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
|
||||
const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
|
||||
|
||||
const __m128i delta1_0 = _mm_set_epi64x(qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
|
||||
qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
|
||||
const __m128i delta1_1 = _mm_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
|
||||
qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
|
||||
const __m128i delta2_0 = _mm_set_epi64x(qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
|
||||
qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
|
||||
const __m128i delta2_1 = _mm_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
|
||||
qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
|
||||
|
||||
const __m128i dot3_0 = mul_add_epi8_sse(delta1_0, q8b_1_0);
|
||||
const __m128i dot3_1 = mul_add_epi8_sse(delta1_1, q8b_1_1);
|
||||
const __m128i dot4_0 = mul_add_epi8_sse(delta2_0, q8b_2_0);
|
||||
const __m128i dot4_1 = mul_add_epi8_sse(delta2_1, q8b_2_1);
|
||||
|
||||
__m128i scale1_0 = _mm_set1_epi16(sc[ib/2] >> 0);
|
||||
__m128i scale1_1 = _mm_set1_epi16(sc[ib/2] >> 3);
|
||||
__m128i scale2_0 = _mm_set1_epi16(sc[ib/2] >> 6);
|
||||
__m128i scale2_1 = _mm_set1_epi16(sc[ib/2] >> 9);
|
||||
|
||||
scale1_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_0, mask), 1), mone);
|
||||
scale1_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_1, mask), 1), mone);
|
||||
scale2_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_0, mask), 1), mone);
|
||||
scale2_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_1, mask), 1), mone);
|
||||
const __m128i p1_0 = _mm_madd_epi16(dot1_0, scale1_0);
|
||||
const __m128i p1_1 = _mm_madd_epi16(dot1_1, scale1_1);
|
||||
const __m128i p2_0 = _mm_madd_epi16(dot2_0, scale2_0);
|
||||
const __m128i p2_1 = _mm_madd_epi16(dot2_1, scale2_1);
|
||||
const __m128i p3_0 = _mm_madd_epi16(dot3_0, scale1_0);
|
||||
const __m128i p3_1 = _mm_madd_epi16(dot3_1, scale1_1);
|
||||
const __m128i p4_0 = _mm_madd_epi16(dot4_0, scale2_0);
|
||||
const __m128i p4_1 = _mm_madd_epi16(dot4_1, scale2_1);
|
||||
|
||||
sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
|
||||
sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
|
||||
sumi2_0 = _mm_add_epi32(sumi2_0, _mm_add_epi32(p3_0, p4_0));
|
||||
sumi2_1 = _mm_add_epi32(sumi2_1, _mm_add_epi32(p3_1, p4_1));
|
||||
|
||||
qs += 8; qh += 4;
|
||||
}
|
||||
|
||||
const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
|
||||
|
||||
accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1);
|
||||
accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2);
|
||||
}
|
||||
|
||||
*s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
|
||||
|
||||
#else
|
||||
|
||||
int sum1[2], sum2[2], delta[4];
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue