From 4aea48974064eb2360eec28458678df7dbaf235c Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Sun, 18 Jun 2023 18:12:34 +0300 Subject: [PATCH] k_quants: faster Q5_K on older GPUs 68.5 ms/tok -> 62.0 ms/tok on GTX-1660. For some reason the same access pattern that leads to such resounding success for Q2_K to Q4_K did not work at all for Q5_K. It is also more difficult to measure because for Q5_K_S we only have 32 layers on the GTX-1660, so output, tok embeddings and kv cache are done on the CPU. --- ggml-cuda.cu | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index dc9f75a5f..9248d409f 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -710,7 +710,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float const int il = tid/4; // 0...3 const int ir = tid - 4*il;// 0...3 - const int n = 4; + const int n = 2; const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 const int in = il%2; @@ -749,11 +749,16 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float float4 sum = {0.f, 0.f, 0.f, 0.f}; float smin = 0; for (int l = 0; l < n; ++l) { - sum.x += y1[l+ 0] * ((ql1[l] & 0xF) + (qh[l] & (hm1 << 0) ? 16 : 0)); - sum.y += y1[l+32] * ((ql1[l] >> 4) + (qh[l] & (hm1 << 1) ? 16 : 0)); - sum.z += y2[l+ 0] * ((ql2[l] & 0xF) + (qh[l] & (hm2 << 0) ? 16 : 0)); - sum.w += y2[l+32] * ((ql2[l] >> 4) + (qh[l] & (hm2 << 1) ? 16 : 0)); - smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0)) + + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0)); + sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0)) + + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0)); + sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0)) + + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0)); + sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0)) + + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0)); + smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] + + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; } tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;