From 12fc292ee64bc90ddf3c6c421ec576e87a736733 Mon Sep 17 00:00:00 2001 From: JohannesGaessler Date: Fri, 12 May 2023 12:42:09 +0200 Subject: [PATCH] Added q4_1 via template --- ggml-cuda.cu | 88 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 57 insertions(+), 31 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 99c3ea808..0674bd3c9 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -32,7 +32,9 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); } \ } while (0) +typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1); typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream); +typedef void (*dequantize_mul_mat_vec_cuda_t)(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream); #define QK4_0 32 typedef struct { @@ -73,6 +75,37 @@ typedef struct { } block_q8_0; static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding"); +#define CUDA_DMMV_BLOCK_SIZE 32 + +static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){ + const block_q4_0 * x = (const block_q4_0 *) vx; + + const float d = x[ib].d; + + const uint8_t vui = x[ib].qs[iqs]; + + const int8_t vi0 = vui & 0xF; + const int8_t vi1 = vui >> 4; + + v0 = (vi0 - 8)*d; + v1 = (vi1 - 8)*d; +} + +static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){ + const block_q4_1 * x = (const block_q4_1 *) vx; + + const float d = x[ib].d; + const float m = x[ib].m; + + const uint8_t vui = x[ib].qs[iqs]; + + const int8_t vi0 = vui & 0xF; + const int8_t vi1 = vui >> 4; + + v0 = vi0*d + m; + v1 = vi1*d + m; +} + static __global__ void dequantize_block_q4_0(const void * vx, float * y) { static const int qk = QK4_0; @@ -173,10 +206,7 @@ static __global__ void dequantize_block_q8_0(const void * vx, float * y) { } } -template static __global__ void dequantize_mul_mat_q4_0(const void * vx, const float * y, float * dst, const int ncols) { - const block_q4_0 * x = (const block_q4_0 *) vx; - const int qk = QK4_0; - +template static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols) { const int row = blockIdx.x; const int tid = threadIdx.x; @@ -190,17 +220,8 @@ template static __global__ void dequantize_mul_mat_q4_0(const v const int iybs = col - col%qk; // y block start index // dequantize - const float d = x[ib].d; - - const uint8_t * pp = x[ib].qs; - - const uint8_t vui = pp[iqs]; - - const int8_t vi0 = vui & 0xF; - const int8_t vi1 = vui >> 4; - - const float v0 = (vi0 - 8)*d; - const float v1 = (vi1 - 8)*d; + float v0, v1; + dequantize_kernel(vx, ib, iqs, v0, v1); // matrix multiplication tmp[tid] += v0 * y[iybs + iqs + 0]; @@ -244,21 +265,14 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStre dequantize_block_q8_0<<>>(vx, y); } -static void dequantize_mul_mat_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - // static int block_size = -1; - // if (block_size == -1) { - // int min_grid_size, max_block_size = 1; - // CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &max_block_size, dequantize_mul_mat_q4_0<256>, 0, 0)); - // max_block_size = min(max_block_size, GGML_CUDA_MAX_BLOCK_SIZE); - // block_size = 1; - // while (block_size*2 <= max_block_size && block_size*2 % ncols == 0) { - // block_size *= 2; - // } - // } - // dequantize_mul_mat_q4_0<<>>(vx, y, dst, ncols); - const int block_size = 32; - GGML_ASSERT(ncols % block_size == 0); - dequantize_mul_mat_q4_0<<>>(vx, y, dst, ncols); +static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0); + dequantize_mul_mat_vec<<>>(vx, y, dst, ncols); +} + +static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0); + dequantize_mul_mat_vec<<>>(vx, y, dst, ncols); } // TODO: optimize @@ -293,6 +307,17 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { } } +static dequantize_mul_mat_vec_cuda_t ggml_get_dequantize_mul_mat_vec_cuda(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return dequantize_mul_mat_vec_q4_0_cuda; + case GGML_TYPE_Q4_1: + return dequantize_mul_mat_vec_q4_1_cuda; + default: + return nullptr; + } +} + // buffer pool for cuda #define MAX_CUDA_BUFFERS 256 @@ -610,6 +635,7 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor char * d_Q = (char *) ggml_cuda_pool_malloc(n_mm * q_sz, &q_size); const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(type); + dequantize_mul_mat_vec_cuda_t dmmv = ggml_get_dequantize_mul_mat_vec_cuda(type); GGML_ASSERT(to_fp32_cuda != nullptr); for (int64_t i03 = 0; i03 < ne03; i03++) { @@ -641,7 +667,7 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0)); // compute - dequantize_mul_mat_q4_0_cuda(c_Q, c_Y, c_D, ne00, ne01, cudaStream); + dmmv(c_Q, c_Y, c_D, ne00, ne01, cudaStream); CUDA_CHECK(cudaGetLastError()); } else {