From 62832c57c40f181232ab28a5df01740d3fafa9e1 Mon Sep 17 00:00:00 2001 From: slaren Date: Sat, 30 Sep 2023 12:21:56 +0200 Subject: [PATCH] ggml-cuda : perform cublas matrix multiplication of quantized types as fp16 --- ggml-cuda.cu | 111 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 80 insertions(+), 31 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 86d1fe203..153bd1fb9 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -715,7 +715,8 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in //================================== k-quants -static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) { +template +static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { const int i = blockIdx.x; const block_q2_K * x = (const block_q2_K *) vx; @@ -727,7 +728,7 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float const int is = 8*n + l/16; const uint8_t q = x[i].qs[32*n + l]; - float * y = yy + i*QK_K + 128*n; + dst_t * y = yy + i*QK_K + 128*n; float dall = __low2half(x[i].dm); float dmin = __high2half(x[i].dm); @@ -739,7 +740,7 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float const int is = tid/16; // 0 or 1 const int il = tid%16; // 0...15 const uint8_t q = x[i].qs[il] >> (2*is); - float * y = yy + i*QK_K + 16*is + il; + dst_t * y = yy + i*QK_K + 16*is + il; float dall = __low2half(x[i].dm); float dmin = __high2half(x[i].dm); y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); @@ -748,7 +749,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float } -static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) { +template +static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { const int i = blockIdx.x; const block_q3_K * x = (const block_q3_K *) vx; @@ -772,7 +774,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float float d_all = x[i].d; float dl = d_all * (us - 32); - float * y = yy + i*QK_K + 128*n + 32*j; + dst_t * y = yy + i*QK_K + 128*n + 32*j; const uint8_t * q = x[i].qs + 32*n; const uint8_t * hm = x[i].hmask; @@ -784,7 +786,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float const int im = il/8; // 0...1 const int in = il%8; // 0...7 - float * y = yy + i*QK_K + 16*is + il; + dst_t * y = yy + i*QK_K + 16*is + il; const uint8_t q = x[i].qs[il] >> (2*is); const uint8_t h = x[i].hmask[in] >> (2*is + im); @@ -812,7 +814,8 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t } #endif -static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) { +template +static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { const block_q4_K * x = (const block_q4_K *) vx; const int i = blockIdx.x; @@ -825,7 +828,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float const int is = 2*il; const int n = 4; - float * y = yy + i*QK_K + 64*il + n*ir; + dst_t * y = yy + i*QK_K + 64*il + n*ir; const float dall = __low2half(x[i].dm); const float dmin = __high2half(x[i].dm); @@ -844,7 +847,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float #else const int tid = threadIdx.x; const uint8_t * q = x[i].qs; - float * y = yy + i*QK_K; + dst_t * y = yy + i*QK_K; const float d = (float)x[i].dm[0]; const float m = (float)x[i].dm[1]; y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4); @@ -852,7 +855,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float #endif } -static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) { +template +static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { const block_q5_K * x = (const block_q5_K *) vx; const int i = blockIdx.x; @@ -864,7 +868,7 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float const int ir = tid%16; // ir is in 0...15 const int is = 2*il; // is is in 0...6 - float * y = yy + i*QK_K + 64*il + 2*ir; + dst_t * y = yy + i*QK_K + 64*il + 2*ir; const float dall = __low2half(x[i].dm); const float dmin = __high2half(x[i].dm); @@ -892,13 +896,14 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float const int is = tid/16; // 0 or 1 const uint8_t h = x[i].qh[in] >> im; const float d = x[i].d; - float * y = yy + i*QK_K + tid; + dst_t * y = yy + i*QK_K + tid; y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16)); y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16)); #endif } -static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) { +template +static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { const block_q6_K * x = (const block_q6_K *) vx; const int i = blockIdx.x; @@ -910,7 +915,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float const int il = tid - 32*ip; // 0...32 const int is = 8*ip + il/16; - float * y = yy + i*QK_K + 128*ip + il; + dst_t * y = yy + i*QK_K + 128*ip + il; const float d = x[i].d; @@ -929,7 +934,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float const int ip = tid/16; // 0 or 1 const int il = tid - 16*ip; // 0...15 - float * y = yy + i*QK_K + 16*ip + il; + dst_t * y = yy + i*QK_K + 16*ip + il; const float d = x[i].d; @@ -4604,32 +4609,38 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con quantize_q8_1<<>>(x, vy, kx, kx_padded); } -static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +template +static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; dequantize_block<<>>(vx, y, k); } -static void dequantize_row_q4_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +template +static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; dequantize_block<<>>(vx, y, k); } -static void dequantize_row_q5_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +template +static void dequantize_row_q5_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; dequantize_block<<>>(vx, y, k); } -static void dequantize_row_q5_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +template +static void dequantize_row_q5_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; dequantize_block<<>>(vx, y, k); } -static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +template +static void dequantize_row_q8_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; dequantize_block<<>>(vx, y, k); } -static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +template +static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; #if QK_K == 256 dequantize_block_q2_K<<>>(vx, y); @@ -4638,7 +4649,8 @@ static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cu #endif } -static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +template +static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; #if QK_K == 256 dequantize_block_q3_K<<>>(vx, y); @@ -4647,12 +4659,14 @@ static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cu #endif } -static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +template +static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; dequantize_block_q4_K<<>>(vx, y); } -static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +template +static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; #if QK_K == 256 dequantize_block_q5_K<<>>(vx, y); @@ -4660,8 +4674,8 @@ static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cu dequantize_block_q5_K<<>>(vx, y); #endif } - -static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { +template +static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; #if QK_K == 256 dequantize_block_q6_K<<>>(vx, y); @@ -4868,6 +4882,26 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { switch (type) { + case GGML_TYPE_Q4_0: + return dequantize_row_q4_0_cuda; + case GGML_TYPE_Q4_1: + return dequantize_row_q4_1_cuda; + case GGML_TYPE_Q5_0: + return dequantize_row_q5_0_cuda; + case GGML_TYPE_Q5_1: + return dequantize_row_q5_1_cuda; + case GGML_TYPE_Q8_0: + return dequantize_row_q8_0_cuda; + case GGML_TYPE_Q2_K: + return dequantize_row_q2_K_cuda; + case GGML_TYPE_Q3_K: + return dequantize_row_q3_K_cuda; + case GGML_TYPE_Q4_K: + return dequantize_row_q4_K_cuda; + case GGML_TYPE_Q5_K: + return dequantize_row_q5_K_cuda; + case GGML_TYPE_Q6_K: + return dequantize_row_q6_K_cuda; case GGML_TYPE_F32: return convert_fp32_to_fp16_cuda; default: @@ -6083,8 +6117,19 @@ inline void ggml_cuda_op_mul_mat_cublas( const int compute_capability = g_compute_capabilities[id]; - if (compute_capability >= CC_TURING && src0->type == GGML_TYPE_F16 && ggml_is_contiguous(src0) && ldc == row_diff) { - // convert src1 to fp16, multiply as fp16, convert dst to fp32 + if (compute_capability >= CC_TURING && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && ldc == row_diff) { + // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 + half * src0_as_f16 = nullptr; + size_t src0_as = 0; + if (src0->type != GGML_TYPE_F16) { + const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type); + GGML_ASSERT(to_fp16_cuda != nullptr); + size_t ne = row_diff*ne00; + src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as); + to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream); + } + const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (half *) src0_dd_i : src0_as_f16; + half * src1_as_f16 = nullptr; size_t src1_as = 0; if (src1->type != GGML_TYPE_F16) { @@ -6106,9 +6151,9 @@ inline void ggml_cuda_op_mul_mat_cublas( CUBLAS_CHECK( cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, row_diff, src1_ncols, ne10, - &alpha_f16, src0_dd_i, CUDA_R_16F, ne00, - src1_ptr, CUDA_R_16F, ne10, - &beta_f16, dst_f16, CUDA_R_16F, ldc, + &alpha_f16, src0_ptr, CUDA_R_16F, ne00, + src1_ptr, CUDA_R_16F, ne10, + &beta_f16, dst_f16, CUDA_R_16F, ldc, CUBLAS_COMPUTE_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); @@ -6117,6 +6162,10 @@ inline void ggml_cuda_op_mul_mat_cublas( ggml_cuda_pool_free(dst_f16, dst_as); + if (src0_as != 0) { + ggml_cuda_pool_free(src0_as_f16, src0_as); + } + if (src1_as != 0) { ggml_cuda_pool_free(src1_as_f16, src1_as); }