Add some comments to satisfy PR reviewer

2023-07-23 23:26:03 +03:00 · 2023-07-23 23:26:03 +03:00 · f3a92117a7
commit f3a92117a7
parent 6baa4ead58
1 changed files with 6 additions and 0 deletions
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -2349,6 +2349,9 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(1, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    // Note: we use QI4_K/2 instead of QI4_K to make the dot product template require 4 groups of quants to be processed per
    //       kernel call instead of 2. This results in better performance because the cost of computing the k-quant scales
    //       is better amortized.
    mul_mat_vec_q<QK_K, QI4_K/2, block_q4_K, vec_dot_q4_K_q8_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
@ -2358,6 +2361,9 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(1, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    // Note: we use QI5_K/2 instead of QI5_K to make the dot product template require 4 groups of quants to be processed per
    //       kernel call instead of 2. This results in better performance because the cost of computing the k-quant scales
    //       is better amortized.
    mul_mat_vec_q<QK_K, QI5_K/2, block_q5_K, vec_dot_q5_K_q8_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }