From 4c93e55996af94000930576293072ddeecebfd2d Mon Sep 17 00:00:00 2001 From: lijiahao Date: Sat, 26 Aug 2023 20:38:32 +0800 Subject: [PATCH 1/4] cuda: 1.2x faster dequantization kernel --- ggml-cuda.cu | 48 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 83d53c13c..e5f827307 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -4197,9 +4197,53 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con quantize_q8_1<<>>(x, vy, kx, kx_padded); } +static __device__ __forceinline__ dfloat2 dfmul2(dfloat2 a, dfloat2 b) { +#ifdef GGML_CUDA_F16 + return __hmul2(a, b); +#else + return make_float2(a.x * b.x, a.y * b.y); +#endif +} + +static __device__ __forceinline__ float2 dfloat22float2(dfloat2 a) { +#ifdef GGML_CUDA_F16 + return __half22float2(a); +#else + return a; +#endif +} + +static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, float * __restrict__ y, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i*4 >= k) { + return; + } + + const int ib = i/(QK4_0/4); + const int iqs = i%(QK4_0/4); + + const block_q4_0 * x = (const block_q4_0 *) vx; + const uchar2 qs = *(uchar2 *)(x[ib].qs + iqs*2); + const dfloat d = x[ib].d; + + dfloat2 dv0; + dv0.x = (int)(qs.x & 0xf) - 8; + dv0.y = (int)(qs.y & 0xf) - 8; + float2 v0 = dfloat22float2(dfmul2(dv0, {d, d})); + *(float2 *)(y + ib*QK4_0 + iqs*2) = v0; + + dfloat2 dv1; + dv1.x = (int)(qs.x >> 4) - 8; + dv1.y = (int)(qs.y >> 4) - 8; + float2 v1 = dfloat22float2(dfmul2(dv1, {d, d})); + *(float2 *)(y + ib*QK4_0 + QK4_0/2 + iqs*2) = v1; +} + static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { - const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; - dequantize_block<<>>(vx, y, k); + GGML_ASSERT(k % 4 == 0); + const int num_blocks = (k/4 + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; + dequantize_block_q4_0<<>>(vx, y, k); } static void dequantize_row_q4_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { From d01f52409fbf1abb41ec9bcd55d42d20dd3469d1 Mon Sep 17 00:00:00 2001 From: lijiahao Date: Sat, 26 Aug 2023 21:14:09 +0800 Subject: [PATCH 2/4] Added const if possible --- ggml-cuda.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index e5f827307..53f6891cb 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -4224,19 +4224,19 @@ static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, float const int iqs = i%(QK4_0/4); const block_q4_0 * x = (const block_q4_0 *) vx; - const uchar2 qs = *(uchar2 *)(x[ib].qs + iqs*2); + const uchar2 qs = *(const uchar2 *)(x[ib].qs + iqs*2); const dfloat d = x[ib].d; dfloat2 dv0; dv0.x = (int)(qs.x & 0xf) - 8; dv0.y = (int)(qs.y & 0xf) - 8; - float2 v0 = dfloat22float2(dfmul2(dv0, {d, d})); + const float2 v0 = dfloat22float2(dfmul2(dv0, {d, d})); *(float2 *)(y + ib*QK4_0 + iqs*2) = v0; dfloat2 dv1; dv1.x = (int)(qs.x >> 4) - 8; dv1.y = (int)(qs.y >> 4) - 8; - float2 v1 = dfloat22float2(dfmul2(dv1, {d, d})); + const float2 v1 = dfloat22float2(dfmul2(dv1, {d, d})); *(float2 *)(y + ib*QK4_0 + QK4_0/2 + iqs*2) = v1; } From af31f1f00d6fc6b44507597e1b09e6dad497582b Mon Sep 17 00:00:00 2001 From: lijiahao Date: Sun, 27 Aug 2023 11:06:28 +0800 Subject: [PATCH 3/4] Use make_half2 for better compatibility --- ggml-cuda.cu | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 53f6891cb..48587beaa 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -4197,6 +4197,12 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con quantize_q8_1<<>>(x, vy, kx, kx_padded); } +#ifdef GGML_CUDA_F16 +#define make_dfloat2(x, y) make_half2((x), (y)) +#else +#define make_dfloat2(x, y) make_float2((x), (y)) +#endif + static __device__ __forceinline__ dfloat2 dfmul2(dfloat2 a, dfloat2 b) { #ifdef GGML_CUDA_F16 return __hmul2(a, b); @@ -4227,15 +4233,11 @@ static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, float const uchar2 qs = *(const uchar2 *)(x[ib].qs + iqs*2); const dfloat d = x[ib].d; - dfloat2 dv0; - dv0.x = (int)(qs.x & 0xf) - 8; - dv0.y = (int)(qs.y & 0xf) - 8; + dfloat2 dv0 = make_dfloat2((int)(qs.x & 0xf) - 8, (int)(qs.y & 0xf) - 8); const float2 v0 = dfloat22float2(dfmul2(dv0, {d, d})); *(float2 *)(y + ib*QK4_0 + iqs*2) = v0; - dfloat2 dv1; - dv1.x = (int)(qs.x >> 4) - 8; - dv1.y = (int)(qs.y >> 4) - 8; + dfloat2 dv1 = make_dfloat2((int)(qs.x >> 4) - 8, (int)(qs.y >> 4) - 8); const float2 v1 = dfloat22float2(dfmul2(dv1, {d, d})); *(float2 *)(y + ib*QK4_0 + QK4_0/2 + iqs*2) = v1; } @@ -5755,6 +5757,7 @@ inline void ggml_cuda_op_alibi( (void) src1; (void) src0_ddq_i; (void) src1_ddf_i; + (void) i02; (void) i1; } From 2d7a0fbe68be0dd273d17e530a6633c3e953d2ab Mon Sep 17 00:00:00 2001 From: lijiahao Date: Sun, 27 Aug 2023 11:14:32 +0800 Subject: [PATCH 4/4] Replace make_half2 with __halves2half2 --- ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 48587beaa..3ff04068c 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -4198,7 +4198,7 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con } #ifdef GGML_CUDA_F16 -#define make_dfloat2(x, y) make_half2((x), (y)) +#define make_dfloat2(x, y) __halves2half2((x), (y)) #else #define make_dfloat2(x, y) make_float2((x), (y)) #endif