diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 73be3a3cd..d18c05204 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -2349,6 +2349,9 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    // Note: we use QI4_K/2 instead of QI4_K to make the dot product template require 4 groups of quants to be processed per
+    // kernel call instead of 2. This results in better performance because the cost of computing the k-quant scales
+    // is better amortized.
     mul_mat_vec_q<QK_K, QI4_K/2, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
@@ -2358,6 +2361,9 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    // Note: we use QI5_K/2 instead of QI5_K to make the dot product template require 4 groups of quants to be processed per
+    // kernel call instead of 2. This results in better performance because the cost of computing the k-quant scales
+    // is better amortized.
     mul_mat_vec_q<QK_K, QI5_K/2, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
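
The added comments reason about amortization: each dot-product call loads and applies the per-block k-quant scales once, so making a single call consume more groups of quants pays that cost less often. Below is a minimal toy sketch of that effect, not the actual ggml kernel: every name in it (toy_block, toy_vec_dot, groups_per_call, calls_per_block) is invented for illustration, the block layout is simplified to one float scale per 4 groups of 8 int8 quants, and the real kernels additionally decode packed 6-bit scales, which is the cost the patch amortizes.

#include <cstdint>

#define WARP_SIZE 32

// Toy analogue of a k-quant block: one scale shared by 4 groups of 8 quants.
struct toy_block {
    float  scale;
    int8_t quants[32];
};

// groups_per_call plays the role of the halved qi template parameter in the
// patch: it decides how many groups of quants one dot-product call consumes.
template <int groups_per_call>
__device__ float toy_vec_dot(const toy_block * bx, const int8_t * y, const int iqs) {
    int sumi = 0;
#pragma unroll
    for (int j = 0; j < 8*groups_per_call; ++j) {
        sumi += bx->quants[iqs + j] * y[iqs + j];
    }
    // The scale is loaded and applied once per call: with 4 groups per call
    // this overhead is paid half as often as with 2 groups per call.
    return bx->scale * (float) sumi;
}

// Expects a launch with a single warp per output value, e.g. <<<1, WARP_SIZE>>>.
template <int groups_per_call>
__global__ void toy_mul_mat_vec(const toy_block * x, const int8_t * y, float * dst, const int nblocks) {
    const int calls_per_block = 4 / groups_per_call; // 2 calls (old) or 1 call (new) per block
    float tmp = 0.0f;

    for (int i = threadIdx.x; i < nblocks*calls_per_block; i += WARP_SIZE) {
        const int ib  = i / calls_per_block;                     // block index
        const int iqs = 8*groups_per_call*(i % calls_per_block); // quant offset inside the block
        tmp += toy_vec_dot<groups_per_call>(&x[ib], y + 32*ib, iqs);
    }

    // Reduce the partial sums across the warp.
#pragma unroll
    for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) {
        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, WARP_SIZE);
    }

    if (threadIdx.x == 0) {
        *dst = tmp;
    }
}

Under this sketch's assumptions, toy_mul_mat_vec<2> mirrors the old behavior (2 groups of quants per call, as with QI4_K) and toy_mul_mat_vec<4> the new one (4 groups per call, as with QI4_K/2); both compute the same result, but the second performs half as many scale loads and multiplies per block.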