iq4_xs: ARM_NEON dot product
This commit is contained in:
parent
061a16f5a2
commit
a37980c3d0
1 changed file with 25 additions and 16 deletions
|
@@ -10467,26 +10467,35 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|||
|
||||
float sumf = 0;
|
||||
|
||||
for (int ib = 0; ib < nb; ib += 2) {
|
||||
for (int ibl = 0; ibl < nb; ++ibl) {
|
||||
|
||||
q4bits.val[0] = vld1q_u8(x[ib+0].qs);
|
||||
q4bits.val[1] = vld1q_u8(x[ib+1].qs);
|
||||
q8b.val[0] = vld1q_s8(y[ib+0].qs);
|
||||
q8b.val[1] = vld1q_s8(y[ib+0].qs + 16);
|
||||
q8b.val[2] = vld1q_s8(y[ib+1].qs);
|
||||
q8b.val[3] = vld1q_s8(y[ib+1].qs + 16);
|
||||
const int8_t * q8 = y[ibl].qs;
|
||||
const uint8_t * q4 = x[ibl].qs;
|
||||
uint16_t h = x[ibl].scales_h;
|
||||
|
||||
q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
|
||||
q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
|
||||
q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
|
||||
q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
|
||||
int sumi1 = 0, sumi2 = 0;
|
||||
for (int ib = 0; ib < QK_K/64; ++ib) {
|
||||
|
||||
prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
|
||||
prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
|
||||
q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
|
||||
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
||||
|
||||
sumf +=
|
||||
GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib+0].d) * vaddvq_s32(prod_1) +
|
||||
GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib+1].d) * vaddvq_s32(prod_2);
|
||||
q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
|
||||
q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
|
||||
q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
|
||||
q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
|
||||
|
||||
prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
|
||||
prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
|
||||
|
||||
int ls1 = ((x[ibl].scales_l[ib] & 0xf) | ((h << 4) & 0x30)) - 32;
|
||||
int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32;
|
||||
h >>= 4;
|
||||
sumi1 += vaddvq_s32(prod_1) * ls1;
|
||||
sumi2 += vaddvq_s32(prod_2) * ls2;
|
||||
|
||||
}
|
||||
|
||||
sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue