fixup! More GPU threads for CUDA kernels

This commit is contained in:
JohannesGaessler 2023-05-07 18:34:04 +02:00
parent 50148408b5
commit 8d8de07a4e

View file

@@ -242,7 +242,7 @@ static void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStre
int min_grid_size, block_size = 1; // Initialize to suppress compiler warning.
CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, dequantize_block_q4_0, 0, 0));
int grid_size = (nb + block_size - 1) / block_size; // Round up.
-dequantize_block_q4_0<<<grid_size, block_size, 0, stream>>>(vx, y, k);
+dequantize_block_q4_0<<<grid_size, block_size, 0, stream>>>(vx, y, nb);
}
static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
@@ -250,7 +250,7 @@ static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStre
int min_grid_size, block_size = 1; // Initialize to suppress compiler warning.
CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, dequantize_block_q4_1, 0, 0));
int grid_size = (nb + block_size - 1) / block_size; // Round up.
-dequantize_block_q4_1<<<grid_size, block_size, 0, stream>>>(vx, y, k);
+dequantize_block_q4_1<<<grid_size, block_size, 0, stream>>>(vx, y, nb);
}
static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
@@ -258,7 +258,7 @@ static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStre
int min_grid_size, block_size = 1; // Initialize to suppress compiler warning.
CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, dequantize_block_q4_2, 0, 0));
int grid_size = (nb + block_size - 1) / block_size; // Round up.
-dequantize_block_q4_2<<<grid_size, block_size, 0, stream>>>(vx, y, k);
+dequantize_block_q4_2<<<grid_size, block_size, 0, stream>>>(vx, y, nb);
}
static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
@@ -266,7 +266,7 @@ static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStre
int min_grid_size, block_size = 1; // Initialize to suppress compiler warning.
CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, dequantize_block_q5_0, 0, 0));
int grid_size = (nb + block_size - 1) / block_size; // Round up.
-dequantize_block_q5_0<<<grid_size, block_size, 0, stream>>>(vx, y, k);
+dequantize_block_q5_0<<<grid_size, block_size, 0, stream>>>(vx, y, nb);
}
static void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
@@ -274,7 +274,7 @@ static void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStre
int min_grid_size, block_size = 1; // Initialize to suppress compiler warning.
CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, dequantize_block_q5_1, 0, 0));
int grid_size = (nb + block_size - 1) / block_size; // Round up.
-dequantize_block_q5_1<<<grid_size, block_size, 0, stream>>>(vx, y, k);
+dequantize_block_q5_1<<<grid_size, block_size, 0, stream>>>(vx, y, nb);
}
static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
@@ -282,7 +282,7 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStre
int min_grid_size, block_size = 1; // Initialize to suppress compiler warning.
CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, dequantize_block_q8_0, 0, 0));
int grid_size = (nb + block_size - 1) / block_size; // Round up.
-dequantize_block_q8_0<<<grid_size, block_size, 0, stream>>>(vx, y, k);
+dequantize_block_q8_0<<<grid_size, block_size, 0, stream>>>(vx, y, nb);
}
// TODO: optimize