Better 1.5 bit quantization (#5971)
* Trying blocks of 16 for IQ1_S - seems slightly better

* iq1s_blocks16: Adjust scale fudge factor to 1.125

* iq1s_blocks16: going to blocks of 32 with 2048 lattice points, so same bpw.
  This is even better than blocks of 16. Should I try blocks of 64? But to keep
  the same bpw with 4096 lattice points, I would need to remove blocks altogether
  and just have superblocks of 256 weights.

* iq1s_blocks16: use 2*<x^2> as sigma2 in the weight adjustment

* iq1s_blocks16: scalar and AVX2 dot products

* iq1s_blocks16: CUDA dot product

* iq1s_blocks16: Metal works, Neon does not.
  Metal works but TG is dog slow (35 t/s); PP is OK-ish (493 t/s).
  Not seeing the bug in the Neon implementation for now.

* iq1s_blocks16: fixed Neon

* iq1s_blocks16: very slightly faster TG on Metal. Still pathetic at 37 t/s.

* iq1s_blocks16: speed up Metal by packing the codebook into uint32_t's

* Formatting

* iq1s_blocks16: the uint32_t codebook is also better in CUDA.
  TG-128 is now 204 t/s, up from 194 t/s. PP-512 is 5890 t/s, so significantly
  better than the other quants.

* iq1s_blocks16: slightly faster Neon dot product

* iq1s_blocks16: faster AVX2 dot product

* iq1s_blocks16: adjust to ggml-common.h

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
parent ef3ced26a3
commit be858f6205
5 changed files with 1152 additions and 393 deletions
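As a sanity check on the "same bpw" remark in the message: the new layout shown in the first hunk below costs exactly what the old one did. Per superblock of QK_K = 256 weights it stores an fp16 scale (2 bytes), qs[QK_K/8] (32 bytes) and qh[QK_K/32] (16 bytes), i.e. 50 bytes or 1.5625 bits per weight, the same 50 bytes as the previous d/qs/scales layout. A minimal C sketch of that accounting (the struct copy and the printout are illustrative only):

#include <stdint.h>
#include <stdio.h>

#define QK_K 256

/* Illustrative copy of the new layout from the first hunk below
 * ('half' replaced by uint16_t for a host-side size check). */
typedef struct {
    uint16_t d;             /* fp16 super-block scale                        */
    uint8_t  qs[QK_K/8];    /* low 8 bits of each 11-bit grid index          */
    uint16_t qh[QK_K/32];   /* per 32 weights: 4 x 3 high bits + block scale */
} block_iq1_s;

int main(void) {
    /* 2 + 32 + 16 = 50 bytes per 256 weights -> 1.5625 bits per weight,
     * the same budget as the previous d/qs/scales layout (2 + 32 + 16). */
    printf("bytes per superblock: %zu\n", sizeof(block_iq1_s));
    printf("bits per weight     : %.4f\n", 8.0 * sizeof(block_iq1_s) / QK_K);
    return 0;
}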
@@ -2595,8 +2595,8 @@ typedef struct {
 typedef struct {
     half d;
-    uint8_t qs[QK_K/8];
-    uint8_t scales[QK_K/16];
+    uint8_t  qs[QK_K/8];
+    uint16_t qh[QK_K/32];
 } block_iq1_s;
 
 // Non-linear quants
 
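The struct change above swaps the per-16-weight packed scales for one uint16_t qh per block of 32 weights. As the Metal kernels further down read it, the low 12 bits of qh supply the three missing high bits for each of the block's four 8-bit qs entries, giving four 11-bit indices into the 2048-point lattice (8 weights per index), and the top bits carry the block scale applied as 2*(qh >> 12) + 1. A scalar decode of one such block, written to mirror those kernels (the function name is made up here; iq1s_grid_gpu is assumed to be the 2048-entry uint32_t table from ggml-common.h whose nibbles hold q+1 with q in {-1, 0, +1}):

#include <stdint.h>

/* Decode one 32-weight IQ1_S block, mirroring dequantize_iq1_s below.
 * 'grid' stands for iq1s_grid_gpu; each entry is read as 4 bytes whose
 * low/high nibbles hold q+1 for weights j and j+4 of a group of 8. */
static void decode_iq1_s_block32(float d, const uint8_t qs[4], uint16_t qh,
                                 const uint32_t *grid, float out[32]) {
    const float dl = d * (2*(qh >> 12) + 1);                 /* per-block scale */
    for (int k = 0; k < 4; ++k) {
        const int idx = qs[k] | (((qh >> 3*k) & 7) << 8);    /* 11-bit index    */
        const uint8_t *gb = (const uint8_t *)&grid[idx];     /* byte view, as in the Metal code */
        for (int j = 0; j < 4; ++j) {
            out[8*k + j + 0] = dl * ((gb[j] & 0xf) - 1);     /* q in {-1,0,+1}  */
            out[8*k + j + 4] = dl * ((gb[j] >> 4)  - 1);
        }
    }
}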
@@ -4338,48 +4338,53 @@ void kernel_mul_mv_iq1_s_f32_impl(
     device const block_iq1_s * x = (device const block_iq1_s *) src0 + ib_row + offset0;
     device const float       * y = (device const float       *) src1 + r1*ne10 + im*ne00*ne1;
 
-    float yl[16];
+    float yl[32];
     float sumf[N_DST]={0.f}, all_sum;
 
     const int nb32 = nb * (QK_K / 32);
 
-    const int ix = tiisg/2;
-    const int il = tiisg%2;
+    const int ix = tiisg;
 
-    device const float * y4 = y + 32 * ix + 16 * il;
+    device const float * y4 = y + 32 * ix;
 
-    for (int ib32 = ix; ib32 < nb32; ib32 += 16) {
+    for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
 
-        for (int i = 0; i < 16; ++i) {
+        float sumy = 0;
+        for (int i = 0; i < 32; ++i) {
             yl[i] = y4[i];
+            sumy += yl[i];
         }
 
         const int ibl = ib32 / (QK_K / 32);
         const int ib  = ib32 % (QK_K / 32);
 
         device const block_iq1_s * xr = x + ibl;
-        device const uint8_t * qs = xr->qs + 4 * ib + 2 * il;
-        device const uint8_t * sc = xr->scales + 2 * ib + il;
-        device const half    * dh = &xr->d;
+        device const uint8_t  * qs = xr->qs + 4 * ib;
+        device const uint16_t * qh = xr->qh + ib;
+        device const half     * dh = &xr->d;
 
         for (int row = 0; row < N_DST; row++) {
 
-            constant int8_t * grid1 = (constant int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5)));
-            constant int8_t * grid2 = (constant int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1)));
+            constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
+            constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 5) & 0x700)));
+            constant uint8_t * grid3 = (constant uint8_t *)(iq1s_grid_gpu + (qs[2] | ((qh[0] << 2) & 0x700)));
+            constant uint8_t * grid4 = (constant uint8_t *)(iq1s_grid_gpu + (qs[3] | ((qh[0] >> 1) & 0x700)));
 
-            float2 sum = {0};
-            for (int j = 0; j < 8; ++j) {
-                sum[0] += yl[j+ 0] * grid1[j];
-                sum[1] += yl[j+ 8] * grid2[j];
+            float sum = 0;
+            for (int j = 0; j < 4; ++j) {
+                sum += yl[j+ 0] * (grid1[j] & 0xf) + yl[j+ 4] * (grid1[j] >> 4)
+                     + yl[j+ 8] * (grid2[j] & 0xf) + yl[j+12] * (grid2[j] >> 4)
+                     + yl[j+16] * (grid3[j] & 0xf) + yl[j+20] * (grid3[j] >> 4)
+                     + yl[j+24] * (grid4[j] & 0xf) + yl[j+28] * (grid4[j] >> 4);
             }
-            sumf[row] += (float)dh[0] * (sum[0] * (2*(sc[0] & 7) + 1) + sum[1] * (2*((sc[0] >> 4) & 7) + 1));
+            sumf[row] += (float)dh[0] * (sum - sumy) * (2*(qh[0] >> 12) + 1);
 
             dh += nb*sizeof(block_iq1_s)/2;
             qs += nb*sizeof(block_iq1_s);
-            sc += nb*sizeof(block_iq1_s);
+            qh += nb*sizeof(block_iq1_s)/2;
         }
 
-        y4 += 16 * 32;
+        y4 += 32 * 32;
     }
 
     for (int row = 0; row < N_DST; ++row) {
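Two things make the rewritten loop cheap. Because every codebook nibble stores q+1, the kernel can accumulate y*(q+1) for all 32 weights and recover sum(y*q) with the single "sum - sumy" correction at the end, rather than handling signs per element; and the per-block scale is just the top bits of qh, applied once as 2*(qh[0] >> 12) + 1. A scalar restatement of the per-block dot product (names are illustrative, not ggml API):

#include <stdint.h>

/* Scalar equivalent of the inner Metal loop above for one 32-weight block:
 * accumulate y*(q+1) over packed nibbles, subtract sum(y) once to obtain
 * sum(y*q), then apply the block scale taken from the top bits of qh. */
static float dot_iq1_s_block32(float d, const uint8_t qs[4], uint16_t qh,
                               const uint32_t *grid, const float y[32]) {
    float sum = 0.0f, sumy = 0.0f;
    for (int i = 0; i < 32; ++i) sumy += y[i];
    for (int k = 0; k < 4; ++k) {
        const int idx = qs[k] | (((qh >> 3*k) & 7) << 8);    /* 11-bit grid index */
        const uint8_t *gb = (const uint8_t *)&grid[idx];
        for (int j = 0; j < 4; ++j) {
            sum += y[8*k + j + 0] * (gb[j] & 0xf)
                 + y[8*k + j + 4] * (gb[j] >> 4);
        }
    }
    return d * (sum - sumy) * (2*(qh >> 12) + 1);            /* apply block scale */
}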
@@ -5066,16 +5071,19 @@ void dequantize_iq2_s(device const block_iq2_s * xb, short il, thread type4x4 &
 template <typename type4x4>
 void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) {
     // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
     const int ib32 = il/2;
     il = il%2;
     const float d = xb->d;
-    device const uint8_t * qs = xb->qs + 2*il;
-    device const uint8_t * sc = xb->scales + il;
-    const float dl1 = d * (2*(sc[0] & 7) + 1);
-    const float dl2 = d * (2*((sc[0] >> 4) & 7) + 1);
-    constant int8_t * grid1 = (constant int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5)));
-    constant int8_t * grid2 = (constant int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1)));
-    for (int i = 0; i < 8; ++i) {
-        reg[i/4+0][i%4] = dl1 * grid1[i];
-        reg[i/4+2][i%4] = dl2 * grid2[i];
+    device const uint8_t  * qs = xb->qs + 4*ib32 + 2*il;
+    device const uint16_t * qh = xb->qh;
+    const float dl = d * (2*(qh[ib32] >> 12) + 1);
+    constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | (((qh[ib32] >> (6*il+0)) & 7) << 8)));
+    constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | (((qh[ib32] >> (6*il+3)) & 7) << 8)));
+    for (int i = 0; i < 4; ++i) {
+        reg[0][i] = dl * (grid1[i] & 0xf) - dl;
+        reg[1][i] = dl * (grid1[i] >> 4) - dl;
+        reg[2][i] = dl * (grid2[i] & 0xf) - dl;
+        reg[3][i] = dl * (grid2[i] >> 4) - dl;
     }
 }
 
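The "packing codebook into uint32_t's" items in the commit message correspond to the switch from iq1s_grid, which the old code read as eight int8_t per grid point, to iq1s_grid_gpu, which both kernels above address byte-wise: each of the 2048 grid points plausibly fits in a single 32-bit word whose four bytes each carry two 4-bit values stored as q+1, so fetching a grid point becomes one aligned 32-bit load instead of eight byte loads, presumably the source of the Metal and CUDA TG gains quoted in the message. A sketch of how such an entry could be packed, consistent with the access pattern above (an assumption about the table layout, not the commit's actual generator):

#include <stdint.h>

/* Pack one 8-value ternary grid point (q[j] in {-1,0,+1}) the way the
 * kernels above unpack it: byte j carries weight j in its low nibble and
 * weight j+4 in its high nibble, each stored as q+1.  Assumes the packed
 * word is later viewed byte-wise on a little-endian target, as the Metal
 * code does through uint8_t pointers. */
static uint32_t pack_iq1s_grid_entry(const int8_t q[8]) {
    uint32_t packed = 0;
    for (int j = 0; j < 4; ++j) {
        const uint32_t lo = (uint32_t)(q[j]     + 1);   /* weight j   -> low nibble  */
        const uint32_t hi = (uint32_t)(q[j + 4] + 1);   /* weight j+4 -> high nibble */
        packed |= (lo | (hi << 4)) << (8*j);            /* little-endian byte j      */
    }
    return packed;
}

With that byte order, ((const uint8_t *)&entry)[j] yields exactly the bytes the dequantize and mul_mv kernels above expect.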