fixup! More GPU threads for CUDA kernels

2023-05-07 18:34:04 +02:00 · 2023-05-07 18:34:04 +02:00 · 8d8de07a4e
commit 8d8de07a4e
parent 50148408b5
1 changed files with 6 additions and 6 deletions
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -242,7 +242,7 @@ static void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStre
    int min_grid_size, block_size = 1; // Initialize to suppress compiler warning.
    CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, dequantize_block_q4_0, 0, 0));
    int grid_size = (nb + block_size - 1) / block_size; // Round up.
-    dequantize_block_q4_0<<<grid_size, block_size, 0, stream>>>(vx, y, k);
+    dequantize_block_q4_0<<<grid_size, block_size, 0, stream>>>(vx, y, nb);
 }

 static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
@@ -250,7 +250,7 @@ static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStre
    int min_grid_size, block_size = 1; // Initialize to suppress compiler warning.
    CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, dequantize_block_q4_1, 0, 0));
    int grid_size = (nb + block_size - 1) / block_size; // Round up.
-    dequantize_block_q4_1<<<grid_size, block_size, 0, stream>>>(vx, y, k);
+    dequantize_block_q4_1<<<grid_size, block_size, 0, stream>>>(vx, y, nb);
 }

 static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
@@ -258,7 +258,7 @@ static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStre
    int min_grid_size, block_size = 1; // Initialize to suppress compiler warning.
    CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, dequantize_block_q4_2, 0, 0));
    int grid_size = (nb + block_size - 1) / block_size; // Round up.
-    dequantize_block_q4_2<<<grid_size, block_size, 0, stream>>>(vx, y, k);
+    dequantize_block_q4_2<<<grid_size, block_size, 0, stream>>>(vx, y, nb);
 }

 static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
@@ -266,7 +266,7 @@ static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStre
    int min_grid_size, block_size = 1; // Initialize to suppress compiler warning.
    CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, dequantize_block_q5_0, 0, 0));
    int grid_size = (nb + block_size - 1) / block_size; // Round up.
-    dequantize_block_q5_0<<<grid_size, block_size, 0, stream>>>(vx, y, k);
+    dequantize_block_q5_0<<<grid_size, block_size, 0, stream>>>(vx, y, nb);
 }

 static void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
@@ -274,7 +274,7 @@ static void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStre
    int min_grid_size, block_size = 1; // Initialize to suppress compiler warning.
    CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, dequantize_block_q5_1, 0, 0));
    int grid_size = (nb + block_size - 1) / block_size; // Round up.
-    dequantize_block_q5_1<<<grid_size, block_size, 0, stream>>>(vx, y, k);
+    dequantize_block_q5_1<<<grid_size, block_size, 0, stream>>>(vx, y, nb);
 }

 static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
@@ -282,7 +282,7 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStre
    int min_grid_size, block_size = 1; // Initialize to suppress compiler warning.
    CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, dequantize_block_q8_0, 0, 0));
    int grid_size = (nb + block_size - 1) / block_size; // Round up.
-    dequantize_block_q8_0<<<grid_size, block_size, 0, stream>>>(vx, y, k);
+    dequantize_block_q8_0<<<grid_size, block_size, 0, stream>>>(vx, y, nb);
 }

 // TODO: optimize