diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 7c856e9ee..2f7f02162 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -242,7 +242,7 @@ static void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStre int min_grid_size, block_size = 1; // Initialize to suppress compiler warning. CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, dequantize_block_q4_0, 0, 0)); int grid_size = (nb + block_size - 1) / block_size; // Round up. - dequantize_block_q4_0<<<grid_size, block_size, 0, stream>>>(vx, y, k); + dequantize_block_q4_0<<<grid_size, block_size, 0, stream>>>(vx, y, nb); } static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) { @@ -250,7 +250,7 @@ static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStre int min_grid_size, block_size = 1; // Initialize to suppress compiler warning. CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, dequantize_block_q4_1, 0, 0)); int grid_size = (nb + block_size - 1) / block_size; // Round up. - dequantize_block_q4_1<<<grid_size, block_size, 0, stream>>>(vx, y, k); + dequantize_block_q4_1<<<grid_size, block_size, 0, stream>>>(vx, y, nb); } static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) { @@ -258,7 +258,7 @@ static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStre int min_grid_size, block_size = 1; // Initialize to suppress compiler warning. CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, dequantize_block_q4_2, 0, 0)); int grid_size = (nb + block_size - 1) / block_size; // Round up. - dequantize_block_q4_2<<<grid_size, block_size, 0, stream>>>(vx, y, k); + dequantize_block_q4_2<<<grid_size, block_size, 0, stream>>>(vx, y, nb); } static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) { @@ -266,7 +266,7 @@ static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStre int min_grid_size, block_size = 1; // Initialize to suppress compiler warning. 
CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, dequantize_block_q5_0, 0, 0)); int grid_size = (nb + block_size - 1) / block_size; // Round up. - dequantize_block_q5_0<<<grid_size, block_size, 0, stream>>>(vx, y, k); + dequantize_block_q5_0<<<grid_size, block_size, 0, stream>>>(vx, y, nb); } static void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) { @@ -274,7 +274,7 @@ static void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStre int min_grid_size, block_size = 1; // Initialize to suppress compiler warning. CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, dequantize_block_q5_1, 0, 0)); int grid_size = (nb + block_size - 1) / block_size; // Round up. - dequantize_block_q5_1<<<grid_size, block_size, 0, stream>>>(vx, y, k); + dequantize_block_q5_1<<<grid_size, block_size, 0, stream>>>(vx, y, nb); } static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) { @@ -282,7 +282,7 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStre int min_grid_size, block_size = 1; // Initialize to suppress compiler warning. CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, dequantize_block_q8_0, 0, 0)); int grid_size = (nb + block_size - 1) / block_size; // Round up. - dequantize_block_q8_0<<<grid_size, block_size, 0, stream>>>(vx, y, k); + dequantize_block_q8_0<<<grid_size, block_size, 0, stream>>>(vx, y, nb); } // TODO: optimize