iq2_xxs: slightly faster dot product

TG-128 is now 48.4 t/s
This commit is contained in:
Iwan Kawrakow 2024-01-03 19:32:57 +01:00
parent dd29610153
commit 1c96aa0d7f

View file

@ -3592,8 +3592,6 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
float yl[32]; float yl[32];
float sumf[N_DST]={0.f}, all_sum; float sumf[N_DST]={0.f}, all_sum;
const int step = sizeof(block_q2_K) * nb;
const int nb32 = nb * (QK_K / 32); const int nb32 = nb * (QK_K / 32);
#if QK_K == 256 #if QK_K == 256
@ -3611,11 +3609,12 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
const int ib = ib32 % (QK_K / 32); const int ib = ib32 % (QK_K / 32);
device const block_iq2_xxs * xr = x + ibl; device const block_iq2_xxs * xr = x + ibl;
device const uint16_t * q2 = xr->qs + 4 * ib;
device const half * dh = &xr->d;
for (int row = 0; row < N_DST; row++) { for (int row = 0; row < N_DST; row++) {
const float db = xr->d; const float db = dh[0];
device const uint16_t * q2 = xr->qs + 4 * ib;
device const uint8_t * aux8 = (device const uint8_t *)q2; device const uint8_t * aux8 = (device const uint8_t *)q2;
const uint32_t aux32 = q2[2] | (q2[3] << 16); const uint32_t aux32 = q2[2] | (q2[3] << 16);
const float d = db * (0.5f + (aux32 >> 28)); const float d = db * (0.5f + (aux32 >> 28));
@ -3630,7 +3629,8 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
} }
sumf[row] += d * sum; sumf[row] += d * sum;
xr += nb; dh += nb*sizeof(block_iq2_xxs)/2;
q2 += nb*sizeof(block_iq2_xxs)/2;
} }
y4 += 32 * 32; y4 += 32 * 32;