A 10% faster CUDA vector dot kernel for Q3_K

Q3_K now runs at ~18.5 ms / token on CUDA,
so the gap to Q4_0 is only 10%.
It seems the memory access pattern matters more for
performance than the amount of computation the kernel
does.
Iwan Kawrakow 2023-06-01 15:22:12 +03:00
parent c5959d53ff
commit 9a9c5a0c80
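
A note on the access-pattern claim in the message: the old kernel (the removed lines below) had each thread loop over 8 consecutive `q` bytes of its own, so at every step the lanes of a warp hit addresses 8 bytes apart; in the rewrite, threads with consecutive `iqs` (steps of 8, i.e. consecutive `l`) read the adjacent bytes `q[l]` and `q[l+16]` and adjacent `y` elements, which the hardware can coalesce into a few wide transactions. Below is a minimal sketch of the two patterns, assuming a single 32-thread warp and a 256-byte buffer; the file and kernel names are invented for illustration and are not part of this commit.

```cuda
// access_pattern_sketch.cu -- hypothetical illustration, not from this commit
#include <stdio.h>
#include <stdint.h>
#include <cuda_runtime.h>

// Old-style pattern: each thread walks its own run of 8 consecutive bytes,
// so at every loop step the 32 lanes touch addresses 8 bytes apart.
__global__ void read_strided(const uint8_t * q, int * out) {
    int t = threadIdx.x;
    int s = 0;
    for (int k = 0; k < 8; ++k) s += q[8*t + k];
    out[t] = s;
}

// New-style pattern: at every loop step adjacent lanes read adjacent bytes,
// so each step coalesces into a single 32-byte transaction.
__global__ void read_coalesced(const uint8_t * q, int * out) {
    int t = threadIdx.x;
    int s = 0;
    for (int k = 0; k < 8; ++k) s += q[t + 32*k];
    out[t] = s;
}

int main() {
    uint8_t h_q[256];
    for (int i = 0; i < 256; ++i) h_q[i] = (uint8_t)i;
    uint8_t * d_q; int * d_out; int h_out[32];
    cudaMalloc(&d_q, sizeof(h_q));
    cudaMalloc(&d_out, sizeof(h_out));
    cudaMemcpy(d_q, h_q, sizeof(h_q), cudaMemcpyHostToDevice);

    read_strided<<<1, 32>>>(d_q, d_out);
    cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
    int t1 = 0; for (int t = 0; t < 32; ++t) t1 += h_out[t];

    read_coalesced<<<1, 32>>>(d_q, d_out);
    cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
    int t2 = 0; for (int t = 0; t < 32; ++t) t2 += h_out[t];

    // same 256 bytes are read either way; only the order differs
    printf("strided total %d, coalesced total %d\n", t1, t2);
    cudaFree(d_q); cudaFree(d_out);
    return 0;
}
```

Both kernels read the same 256 bytes and produce the same total; the only difference is which lane reads which byte at each step, which is exactly the kind of difference the message credits for the speedup.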


@@ -317,34 +317,49 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
 }
 
-static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * y, float & result) {
+static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
 
     const block_q3_K * x = (const block_q3_K *) vx;
 
-    int n = iqs / 128;
-    int iqsn = iqs - 128*n;
-    int j = iqsn / 32;
-    int l = iqsn - 32*j;
-    int shift = 2*j;
-    int iss = 2*j + l/16;
-    int is = 8*n + iss;
-    int is_shift = 2*(is/4);
-    uint8_t m = 1 << (4*n + j);
+    const uint32_t kmask1 = 0x03030303;
+    const uint32_t kmask2 = 0x0f0f0f0f;
 
-    const float d = x[ib].d;
+    uint32_t aux[3];
+    uint32_t utmp[4];
 
+    // if n is 0, we want to do the lower 128, else the upper 128,
+    // covering y[l+0], y[l+32], y[l+64], y[l+96] and
+    // y[l+16], y[l+48], y[l+80], y[l+112]
+    int n = iqs/128;     // 0 or 1
+    int r = iqs - 128*n; // 0...120 in steps of 8
+    int l = r/8;         // 0...15 in steps of 1
+
+    const float   * y  = yy + 128*n + l;
+    const uint8_t * q  = x[ib].qs + 32*n + l;
+    const uint8_t * hm = x[ib].hmask + l;
+    const int8_t  * s  = (const int8_t *)utmp + 8*n;
 
-    int8_t us = n == 0 ? (x[ib].scales[iss] & 0xF) | (((x[ib].scales[is+8-2*is_shift] >> is_shift) & 3) << 4)
-                       : (x[ib].scales[iss] >> 4)  | (((x[ib].scales[is+8-2*is_shift] >> is_shift) & 3) << 4);
-    float scale = d * (us - 32);
+    memcpy(aux, x[ib].scales, 12);
+    utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
+    utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
+    utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
+    utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
+
+    const float dall = x[ib].d;
+    const uint8_t m = 1 << (4*n);
+
+    float sum = y[  0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4))
+              + y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4))
+              + y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4))
+              + y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
+              + y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
+              + y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
+              + y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
+              + y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));
+
+    result = sum * dall;
 
-    float sum = 0;
-    for (int k = 0; k < 8; ++k) {
-        int8_t ql = (q[k] >> shift) & 3;
-        sum += y[iqs + k] * (ql - ((hm[k] & m) ? 0 : 4));
-    }
-    result = sum * scale;
 }
 
 static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
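
Compared to the old per-thread scale lookup, the new version unpacks all 16 6-bit block scales into `utmp` with the `kmask1`/`kmask2` masks, so every thread does more arithmetic up front but then reads its scales as plain byte loads through `s`. Below is a host-side round-trip sketch of that unpacking, assuming the Q3_K scale packing (low 4 bits of scale j in bytes 0..7, high 2 bits packed into bytes 8..11) and a little-endian host, which the kernel's `memcpy` into 32-bit words also assumes; the packing loop and names are illustrative, not from the commit.

```cuda
// q3k_scales_sketch.cu -- hypothetical round-trip check, not from this commit
#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void) {
    // 16 arbitrary 6-bit scales to round-trip
    uint8_t ref[16];
    for (int j = 0; j < 16; ++j) ref[j] = (uint8_t)((7*j + 3) & 63);

    // pack them as block_q3_K stores them (assumed layout, see above)
    uint8_t scales[12] = {0};
    for (int j = 0; j < 16; ++j) {
        if (j < 8) scales[j]     |= (uint8_t)(ref[j] & 0xF);
        else       scales[j - 8] |= (uint8_t)((ref[j] & 0xF) << 4);
        scales[8 + j%4] |= (uint8_t)(((ref[j] >> 4) & 3) << (2*(j/4)));
    }

    // unpack exactly as the new kernel does
    const uint32_t kmask1 = 0x03030303;
    const uint32_t kmask2 = 0x0f0f0f0f;
    uint32_t aux[3], utmp[4];
    memcpy(aux, scales, 12);
    utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
    utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
    utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
    utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);

    // utmp now holds the 16 scales one per byte; the kernel subtracts 32
    const uint8_t * s = (const uint8_t *)utmp;
    int ok = 1;
    for (int j = 0; j < 16; ++j) ok &= (s[j] == ref[j]);
    printf("round-trip %s\n", ok ? "ok" : "FAILED");
    return 0;
}
```

The repeated `((q[..] >> ..) & 3) - (hm[..] & (m << ..) ? 0 : 4)` term in the new sum is the 3-bit dequantization: the low 2 bits of each quant come from `qs`, the third bit from `hmask`, and a cleared high bit subtracts 4, giving values in [-4, 3] before scaling.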