iq3_s_mult: remove SLOW_MULT option

This commit is contained in:
Iwan Kawrakow 2024-03-05 08:23:37 +02:00
parent a6a263b919
commit b1d753be34
2 changed files with 2 additions and 119 deletions

View file

@ -2546,14 +2546,8 @@ typedef struct {
uint8_t signs[QK_K/8];
uint8_t scales[IQ3S_N_SCALE];
} block_iq3_s;
#ifdef IQ3S_SLOW_MULT
#define IQ3S_MULTIPLIER 190842953
#else
//#define IQ3S_MULTIPLIER 898886
//#define IQ3S_MULTIPLIER 842866
#define IQ3S_MULTIPLIER 72968561ULL
constexpr constant static uint8_t iq3s_values[16] = {1, 1, 1, 3, 3, 3, 5, 5, 5, 7, 7, 9, 9, 11, 13, 15};
#endif
typedef struct {
half d;
@ -4702,19 +4696,11 @@ void kernel_mul_mv_iq3_s_f32_impl(
int pos = (32*sgitg + tiisg)*nval;
uint32_t aux32;
thread int8_t * q = (thread int8_t *)&aux32;
#ifdef IQ3S_SLOW_MULT
for (int i = 0; i < nval; ++i) {
aux32 = (IQ3S_MULTIPLIER * (pos + i)) & 0x0f0f0f0f;
for (int k = 0; k < 4; ++k) q[k] = 2*((q[k]-1)/2) + 1;
values[pos + i] = aux32;
}
#else
for (int i = 0; i < nval; ++i) {
aux32 = (IQ3S_MULTIPLIER * (pos + i)) & 0x0f0f0f0f;
for (int k = 0; k < 4; ++k) q[k] = iq3s_values[q[k]];
values[pos + i] = aux32;
}
#endif
threadgroup_barrier(mem_flags::mem_threadgroup);
}
@ -5671,20 +5657,6 @@ void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 &
const float dl = d * (1 + 2*((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf));
uint32_t aux32[2];
thread const int8_t * grid = (thread const int8_t *)aux32;
#ifdef IQ3S_SLOW_MULT
aux32[0] = (IQ3S_MULTIPLIER * (qs[4*il+0] | ((qh << 8) & 256))) & 0x0f0f0f0f;
aux32[1] = (IQ3S_MULTIPLIER * (qs[4*il+1] | ((qh << 7) & 256))) & 0x0f0f0f0f;
for (int i = 0; i < 4; ++i) {
reg[0][i] = dl * (2*((grid[i+0]-1)/2)+1) * select(1, -1, signs[0] & kmask_iq2xs[i+0]);
reg[1][i] = dl * (2*((grid[i+4]-1)/2)+1) * select(1, -1, signs[0] & kmask_iq2xs[i+4]);
}
aux32[0] = (IQ3S_MULTIPLIER * (qs[4*il+2] | ((qh << 6) & 256))) & 0x0f0f0f0f;
aux32[1] = (IQ3S_MULTIPLIER * (qs[4*il+3] | ((qh << 5) & 256))) & 0x0f0f0f0f;
for (int i = 0; i < 4; ++i) {
reg[2][i] = dl * (2*((grid[i+0]-1)/2)+1) * select(1, -1, signs[1] & kmask_iq2xs[i+0]);
reg[3][i] = dl * (2*((grid[i+4]-1)/2)+1) * select(1, -1, signs[1] & kmask_iq2xs[i+4]);
}
#else
aux32[0] = (IQ3S_MULTIPLIER * (qs[4*il+0] | ((qh << 8) & 256))) & 0x0f0f0f0f;
aux32[1] = (IQ3S_MULTIPLIER * (qs[4*il+1] | ((qh << 7) & 256))) & 0x0f0f0f0f;
for (int i = 0; i < 4; ++i) {
@ -5697,7 +5669,6 @@ void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 &
reg[2][i] = dl * iq3s_values[grid[i+0]] * select(1, -1, signs[1] & kmask_iq2xs[i+0]);
reg[3][i] = dl * iq3s_values[grid[i+4]] * select(1, -1, signs[1] & kmask_iq2xs[i+4]);
}
#endif
}
template <typename type4x4>