k_quants: faster Q5_K on older GPUs

68.5 ms/tok -> 62.0 ms/tok on GTX-1660.
For some reason the same access pattern that leads
to such resounding success for Q2_K to Q4_K did not
work at all for Q5_K.

It is also more difficult to measure because for Q5_K_S
we only have 32 layers on the GTX-1660, so output, tok embeddings
and kv cache are done on the CPU.
This commit is contained in:
Iwan Kawrakow 2023-06-18 18:12:34 +03:00
parent d6daebcb0c
commit 4aea489740

View file

@ -710,7 +710,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
const int il = tid/4; // 0...3
const int ir = tid - 4*il;// 0...3
const int n = 4;
const int n = 2;
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
const int in = il%2;
@ -749,11 +749,16 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
float4 sum = {0.f, 0.f, 0.f, 0.f};
float smin = 0;
for (int l = 0; l < n; ++l) {
sum.x += y1[l+ 0] * ((ql1[l] & 0xF) + (qh[l] & (hm1 << 0) ? 16 : 0));
sum.y += y1[l+32] * ((ql1[l] >> 4) + (qh[l] & (hm1 << 1) ? 16 : 0));
sum.z += y2[l+ 0] * ((ql2[l] & 0xF) + (qh[l] & (hm2 << 0) ? 16 : 0));
sum.w += y2[l+32] * ((ql2[l] >> 4) + (qh[l] & (hm2 << 1) ? 16 : 0));
smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
+ y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
+ y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
+ y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
+ y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
+ (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
}
tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;