iq4_xs: ARM_NEON dot product
This commit is contained in:
parent
061a16f5a2
commit
a37980c3d0
1 changed file with 25 additions and 16 deletions
|
@@ -10467,14 +10467,17 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
||||||
|
|
||||||
float sumf = 0;
|
float sumf = 0;
|
||||||
|
|
||||||
for (int ib = 0; ib < nb; ib += 2) {
|
for (int ibl = 0; ibl < nb; ++ibl) {
|
||||||
|
|
||||||
q4bits.val[0] = vld1q_u8(x[ib+0].qs);
|
const int8_t * q8 = y[ibl].qs;
|
||||||
q4bits.val[1] = vld1q_u8(x[ib+1].qs);
|
const uint8_t * q4 = x[ibl].qs;
|
||||||
q8b.val[0] = vld1q_s8(y[ib+0].qs);
|
uint16_t h = x[ibl].scales_h;
|
||||||
q8b.val[1] = vld1q_s8(y[ib+0].qs + 16);
|
|
||||||
q8b.val[2] = vld1q_s8(y[ib+1].qs);
|
int sumi1 = 0, sumi2 = 0;
|
||||||
q8b.val[3] = vld1q_s8(y[ib+1].qs + 16);
|
for (int ib = 0; ib < QK_K/64; ++ib) {
|
||||||
|
|
||||||
|
q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
|
||||||
|
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
||||||
|
|
||||||
q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
|
q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
|
||||||
q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
|
q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
|
||||||
|
@@ -10484,9 +10487,15 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
||||||
prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
|
prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
|
||||||
prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
|
prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
|
||||||
|
|
||||||
sumf +=
|
int ls1 = ((x[ibl].scales_l[ib] & 0xf) | ((h << 4) & 0x30)) - 32;
|
||||||
GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib+0].d) * vaddvq_s32(prod_1) +
|
int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32;
|
||||||
GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib+1].d) * vaddvq_s32(prod_2);
|
h >>= 4;
|
||||||
|
sumi1 += vaddvq_s32(prod_1) * ls1;
|
||||||
|
sumi2 += vaddvq_s32(prod_2) * ls2;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
|
||||||
}
|
}
|
||||||
|
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue