From 5a74dc1536e9c0707df922672bcd453da28646fe Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 14 May 2023 22:19:54 +0200 Subject: [PATCH] Add remaining dequant_mul_mat functions --- ggml-opencl.cpp | 329 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 303 insertions(+), 26 deletions(-) diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index a4d2e1201..c842977b5 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -21,6 +21,13 @@ typedef uchar uint8_t; typedef int int32_t; typedef uint uint32_t; +constant uint GGML_TYPE_Q4_0 = 2; +constant uint GGML_TYPE_Q4_1 = 3; +constant uint GGML_TYPE_Q5_0 = 6; +constant uint GGML_TYPE_Q5_1 = 7; +constant uint GGML_TYPE_Q8_0 = 8; +constant uint GGML_TYPE_Q8_1 = 9; + constant uint QK4_0 = 32; constant uint QR4_0 = 2; struct block_q4_0 @@ -154,7 +161,19 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y[i*qk + j] = x[i].qs[j]*d; } -__kernel void dequantize_mul_mat_vec(__global struct block_q4_0* x, __local float* tmp, __global float* y, __global float* dst, int ncols) { +void dequantize_q4_0(__global const struct block_q4_0* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = x[ib].d; + + const uint8_t vui = x[ib].qs[iqs]; + + const int8_t vi0 = vui & 0xF; + const int8_t vi1 = vui >> 4; + + *v0 = (vi0 - 8)*d; + *v1 = (vi1 - 8)*d; +} + +__kernel void dequantize_mul_mat_vec_q4_0(__global struct block_q4_0* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { const int block_size = get_local_size(0); const int row = get_global_id(0) / block_size; const int tid = get_local_id(0); @@ -173,15 +192,262 @@ __kernel void dequantize_mul_mat_vec(__global struct block_q4_0* x, __local floa const int iybs = col - col%qk; // y block start index // dequantize - const float d = x[ib].d; + float v0, v1; + dequantize_q4_0(x, ib, iqs, &v0, &v1); - const uint8_t vui = x[ib].qs[iqs]; + // matrix multiplication + tmp[tid] += v0 * y[iybs + iqs + 0]; + tmp[tid] += v1 * y[iybs + iqs + y_offset]; + } - const int8_t vi0 = vui & 0xF; - const int8_t vi1 = vui >> 4; + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=block_size/2; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} - float v0 = (vi0 - 8)*d; - float v1 = (vi1 - 8)*d; +void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = x[ib].d; + const float m = x[ib].m; + + const uint8_t vui = x[ib].qs[iqs]; + + const int8_t vi0 = vui & 0xF; + const int8_t vi1 = vui >> 4; + + *v0 = vi0*d + m; + *v1 = vi1*d + m; +} +__kernel void dequantize_mul_mat_vec_q4_1(__global struct block_q4_1* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { + const int block_size = get_local_size(0); + const int row = get_global_id(0) / block_size; + const int tid = get_local_id(0); + + const uint qk = QK4_1; + const uint qr = QR4_1; + + const int y_offset = qr == 1 ? 1 : qk/2; + + tmp[tid] = 0; + + for (int i = 0; i < ncols/block_size; i += 2) { + const int col = i*block_size + 2*tid; + const int ib = (row*ncols + col)/qk; // block index + const int iqs = (col%qk)/qr; // quant index + const int iybs = col - col%qk; // y block start index + + // dequantize + float v0, v1; + dequantize_q4_1(x, ib, iqs, &v0, &v1); + + // matrix multiplication + tmp[tid] += v0 * y[iybs + iqs + 0]; + tmp[tid] += v1 * y[iybs + iqs + y_offset]; + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=block_size/2; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = vload_half(0, (__global half*) &x[ib].d); + + uint32_t qh = x[ib].qh; + + const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16; + const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16; + + *v0 = x0*d; + *v1 = x1*d; +} +__kernel void dequantize_mul_mat_vec_q5_0(__global struct block_q5_0* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { + const int block_size = get_local_size(0); + const int row = get_global_id(0) / block_size; + const int tid = get_local_id(0); + + const uint qk = QK5_0; + const uint qr = QR5_0; + + const int y_offset = qr == 1 ? 1 : qk/2; + + tmp[tid] = 0; + + for (int i = 0; i < ncols/block_size; i += 2) { + const int col = i*block_size + 2*tid; + const int ib = (row*ncols + col)/qk; // block index + const int iqs = (col%qk)/qr; // quant index + const int iybs = col - col%qk; // y block start index + + // dequantize + float v0, v1; + dequantize_q5_0(x, ib, iqs, &v0, &v1); + + // matrix multiplication + tmp[tid] += v0 * y[iybs + iqs + 0]; + tmp[tid] += v1 * y[iybs + iqs + y_offset]; + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=block_size/2; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = vload_half(0, (__global half*) &x[ib].d); + const float m = vload_half(0, (__global half*) &x[ib].m); + + uint32_t qh = x[ib].qh; + + const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0); + const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1); + + *v0 = x0*d + m; + *v1 = x1*d + m; +} +__kernel void dequantize_mul_mat_vec_q5_1(__global struct block_q5_1* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { + const int block_size = get_local_size(0); + const int row = get_global_id(0) / block_size; + const int tid = get_local_id(0); + + const uint qk = QK5_1; + const uint qr = QR5_1; + + const int y_offset = qr == 1 ? 1 : qk/2; + + tmp[tid] = 0; + + for (int i = 0; i < ncols/block_size; i += 2) { + const int col = i*block_size + 2*tid; + const int ib = (row*ncols + col)/qk; // block index + const int iqs = (col%qk)/qr; // quant index + const int iybs = col - col%qk; // y block start index + + // dequantize + float v0, v1; + dequantize_q5_1(x, ib, iqs, &v0, &v1); + + // matrix multiplication + tmp[tid] += v0 * y[iybs + iqs + 0]; + tmp[tid] += v1 * y[iybs + iqs + y_offset]; + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=block_size/2; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = x[ib].d; + + const int8_t vi0 = x[ib].qs[iqs + 0]; + const int8_t vi1 = x[ib].qs[iqs + 1]; + + *v0 = vi0*d; + *v1 = vi1*d; +} +__kernel void dequantize_mul_mat_vec_q8_0(__global struct block_q8_0* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { + const int block_size = get_local_size(0); + const int row = get_global_id(0) / block_size; + const int tid = get_local_id(0); + + const uint qk = QK8_0; + const uint qr = QR8_0; + + const int y_offset = qr == 1 ? 1 : qk/2; + + tmp[tid] = 0; + + for (int i = 0; i < ncols/block_size; i += 2) { + const int col = i*block_size + 2*tid; + const int ib = (row*ncols + col)/qk; // block index + const int iqs = (col%qk)/qr; // quant index + const int iybs = col - col%qk; // y block start index + + // dequantize + float v0, v1; + dequantize_q8_0(x, ib, iqs, &v0, &v1); + + // matrix multiplication + tmp[tid] += v0 * y[iybs + iqs + 0]; + tmp[tid] += v1 * y[iybs + iqs + y_offset]; + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=block_size/2; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){ + *v0 = vload_half(0, (__global half*) &x[ib + 0]); + *v1 = vload_half(0, (__global half*) &x[ib + 1]); +} +__kernel void convert_mul_mat_vec_f16(__global half* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { + const int block_size = get_local_size(0); + const int row = get_global_id(0) / block_size; + const int tid = get_local_id(0); + + const uint qk = 32; + const uint qr = 1; + + const int y_offset = qr == 1 ? 1 : qk/2; + + tmp[tid] = 0; + + for (int i = 0; i < ncols/block_size; i += 2) { + const int col = i*block_size + 2*tid; + const int ib = (row*ncols + col)/qk; // block index + const int iqs = (col%qk)/qr; // quant index + const int iybs = col - col%qk; // y block start index + + // convert + float v0, v1; + convert_f16(x, ib, iqs, &v0, &v1); // matrix multiplication tmp[tid] += v0 * y[iybs + iqs + 0]; @@ -218,7 +484,7 @@ static cl_command_queue queue; static cl_program program; static cl_kernel convert_fp16_to_fp32_cl; static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl; -static cl_kernel dequantize_mul_mat_vec_cl; +static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl; static bool fp16_support; static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) { @@ -312,7 +578,17 @@ void ggml_cl_init(void) { CL_CHECK(err, "clCreateKernel"); // dequant mul mat kernel - dequantize_mul_mat_vec_cl = clCreateKernel(program, "dequantize_mul_mat_vec", &err); + dequantize_mul_mat_vec_q4_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_0", &err); + CL_CHECK(err, "clCreateKernel"); + dequantize_mul_mat_vec_q4_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_1", &err); + CL_CHECK(err, "clCreateKernel"); + dequantize_mul_mat_vec_q5_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_0", &err); + CL_CHECK(err, "clCreateKernel"); + dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err); + CL_CHECK(err, "clCreateKernel"); + dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err); + CL_CHECK(err, "clCreateKernel"); + convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err); CL_CHECK(err, "clCreateKernel"); } @@ -338,17 +614,17 @@ static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) { static cl_kernel* ggml_get_dequantize_mul_mat_vec_cl(ggml_type type) { switch (type) { case GGML_TYPE_Q4_0: - return &dequantize_mul_mat_vec_cl; - // case GGML_TYPE_Q4_1: - // return dequantize_mul_mat_vec_q4_1_cl; - // case GGML_TYPE_Q5_0: - // return dequantize_mul_mat_vec_q5_0_cl; - // case GGML_TYPE_Q5_1: - // return dequantize_mul_mat_vec_q5_1_cl; - // case GGML_TYPE_Q8_0: - // return dequantize_mul_mat_vec_q8_0_cl; - // case GGML_TYPE_F16: - // return convert_mul_mat_vec_f16_cl; + return &dequantize_mul_mat_vec_q4_0_cl; + case GGML_TYPE_Q4_1: + return &dequantize_mul_mat_vec_q4_1_cl; + case GGML_TYPE_Q5_0: + return &dequantize_mul_mat_vec_q5_0_cl; + case GGML_TYPE_Q5_1: + return &dequantize_mul_mat_vec_q5_1_cl; + case GGML_TYPE_Q8_0: + return &dequantize_mul_mat_vec_q8_0_cl; + case GGML_TYPE_F16: + return &convert_mul_mat_vec_f16_cl; default: return nullptr; } @@ -646,6 +922,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * } cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type); + cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type); GGML_ASSERT(to_fp32_cl != nullptr); for (int64_t i03 = 0; i03 < ne03; i03++) { @@ -669,13 +946,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * const size_t global = ne01 * CL_DMMV_BLOCK_SIZE; const size_t local = CL_DMMV_BLOCK_SIZE; const cl_int ncols = ne00; - CL_CHECK(clSetKernelArg(dequantize_mul_mat_vec_cl, 0, sizeof(cl_mem), &d_Q), "clSetKernelArg"); - CL_CHECK(clSetKernelArg(dequantize_mul_mat_vec_cl, 1, sizeof(float) * local, NULL), "clSetKernelArg"); - CL_CHECK(clSetKernelArg(dequantize_mul_mat_vec_cl, 2, sizeof(cl_mem), &d_Y), "clSetKernelArg"); - CL_CHECK(clSetKernelArg(dequantize_mul_mat_vec_cl, 3, sizeof(cl_mem), &d_D), "clSetKernelArg"); - CL_CHECK(clSetKernelArg(dequantize_mul_mat_vec_cl, 4, sizeof(cl_int), &ncols), "clSetKernelArg"); + CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q), "clSetKernelArg"); + CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL), "clSetKernelArg"); + CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y), "clSetKernelArg"); + CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D), "clSetKernelArg"); + CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols), "clSetKernelArg"); CL_CHECK(clFinish(queue), "clFinish"); - CL_CHECK(clEnqueueNDRangeKernel(queue, dequantize_mul_mat_vec_cl, 1, NULL, &global, &local, 0, NULL, &ev_sgemm), "clEnqueueNDRangeKernel"); + CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, 0, NULL, &ev_sgemm), "clEnqueueNDRangeKernel"); } else { // general dequantization kernel + CLBlast matrix matrix multiplication // convert src0 to fp32 on device const size_t global = x_ne;