iq3_s_mult: alternative multiplier / bit twiddling

2024-03-03 08:51:28 +02:00 · 2024-03-03 08:51:28 +02:00 · 726aed307a
commit 726aed307a
parent fe3c20b251
2 changed files with 66 additions and 21 deletions
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -2370,9 +2370,10 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds

 }

-//#define IQ3S_MULTIPLIER 746226
-//#define IQ3S_MULTIPLIER 677595
-#define IQ3S_MULTIPLIER 190842953LL
+//#define IQ3S_MULTIPLIER 190842953LL
+
+//#define IQ3S_MULTIPLIER 5718026
+#define IQ3S_MULTIPLIER 898886

 template<typename dst_t>
 static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
@@ -2390,11 +2391,17 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
    const int8_t * grid = (const int8_t *)aux32;
    const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
    const uint8_t signs = x[i].signs[4*ib + il];
-    aux32[0] = ((qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
-    aux32[1] = ((qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
+    //aux32[0] = ((qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
+    //aux32[1] = ((qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
+    aux32[0] = (((qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f) | 0x01010101;
+    aux32[1] = (((qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f) | 0x01010101;
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    aux32[0] = (((__vmaxs4(__vsub4(aux32[0], 0x01010101), 0x00000000) >> 1) & 0x07070707) << 1) | 0x01010101;
-    aux32[1] = (((__vmaxs4(__vsub4(aux32[1], 0x01010101), 0x00000000) >> 1) & 0x07070707) << 1) | 0x01010101;
+    //aux32[0] = (((__vmaxs4(__vsub4(aux32[0], 0x01010101), 0x00000000) >> 1) & 0x07070707) << 1) | 0x01010101;
+    //aux32[1] = (((__vmaxs4(__vsub4(aux32[1], 0x01010101), 0x00000000) >> 1) & 0x07070707) << 1) | 0x01010101;
+    //aux32[0] = ((__vsub4(aux32[0], 0x01010101) >> 1) << 1) | 0x01010101;
+    //aux32[1] = ((__vsub4(aux32[1], 0x01010101) >> 1) << 1) | 0x01010101;
+    aux32[0] = ((aux32[0] >> 1) << 1) | 0x01010101;
+    aux32[1] = ((aux32[1] >> 1) << 1) | 0x01010101;
    uint32_t signs0 = __vcmpeq4(((signs & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
    uint32_t signs1 = __vcmpeq4(((signs >>  4) * 0x01010101) & 0x08040201, 0x08040201);
    aux32[0] = __vsub4(aux32[0] ^ signs0, signs0);
@@ -5220,10 +5227,16 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
    const int8_t   * q8 = bq8_1[ib32].qs;
    int sumi = 0;
    for (int l = 0; l < 4; ++l) {
+        //aux32[0] = ((qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
+        //aux32[1] = ((qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
+        //aux32[0] = (((__vmaxs4(__vsub4(aux32[0], 0x01010101), 0) >> 1) & 0x07070707) << 1) | 0x01010101;
+        //aux32[1] = (((__vmaxs4(__vsub4(aux32[1], 0x01010101), 0) >> 1) & 0x07070707) << 1) | 0x01010101;
        aux32[0] = ((qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
        aux32[1] = ((qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
-        aux32[0] = (((__vmaxs4(__vsub4(aux32[0], 0x01010101), 0) >> 1) & 0x07070707) << 1) | 0x01010101;
-        aux32[1] = (((__vmaxs4(__vsub4(aux32[1], 0x01010101), 0) >> 1) & 0x07070707) << 1) | 0x01010101;
+        //aux32[0] = ((__vsub4(aux32[0], 0x01010101) >> 1) << 1) | 0x01010101;
+        //aux32[1] = ((__vsub4(aux32[1], 0x01010101) >> 1) << 1) | 0x01010101;
+        aux32[0] = ((aux32[0] >> 1) << 1) | 0x01010101;
+        aux32[1] = ((aux32[1] >> 1) << 1) | 0x01010101;
        uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
        uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
        const int grid_l = __vsub4(aux32[0] ^ signs0, signs0);