iq1_s
This commit is contained in:
parent
5ff64adfe4
commit
75370d779e
1 changed files with 48 additions and 0 deletions
|
@ -10794,6 +10794,54 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
|
|||
|
||||
*s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
|
||||
|
||||
#elif defined __AVX__
|
||||
__m256 accum = _mm256_setzero_ps();
|
||||
float accum1 = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const int8_t * q8 = y[i].qs;
|
||||
const uint8_t * qs = x[i].qs;
|
||||
const uint16_t * qh = x[i].qh;
|
||||
|
||||
__m128i sumi1_0 = _mm_setzero_si128();
|
||||
__m128i sumi1_1 = _mm_setzero_si128();
|
||||
int sumi1 = 0;
|
||||
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
||||
const __m128i q1b_1_0 = _mm_set_epi64x(iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
|
||||
const __m128i q1b_1_1 = _mm_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)]);
|
||||
const __m128i q1b_2_0 = _mm_set_epi64x(iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
|
||||
const __m128i q1b_2_1 = _mm_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)]);
|
||||
qs += 8;
|
||||
const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||
const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||
const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||
const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
||||
|
||||
const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
|
||||
const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
|
||||
const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
|
||||
const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
|
||||
const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
|
||||
const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
|
||||
const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(ls1));
|
||||
const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(ls1));
|
||||
const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(ls2));
|
||||
const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(ls2));
|
||||
|
||||
sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
|
||||
sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
|
||||
sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
|
||||
+ (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
|
||||
}
|
||||
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum);
|
||||
accum1 += d * sumi1;
|
||||
|
||||
}
|
||||
|
||||
*s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
|
||||
|
||||
#elif defined(__POWER9_VECTOR__)
|
||||
const vector unsigned char v0 = vec_splats((unsigned char)0x0);
|
||||
const vector unsigned short vsign = vec_splats((unsigned short)0x8000);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue