Add remaining dequant_mul_mat functions
This commit is contained in:
parent
883e587a04
commit
5a74dc1536
1 changed files with 303 additions and 26 deletions
329
ggml-opencl.cpp
329
ggml-opencl.cpp
|
@ -21,6 +21,13 @@ typedef uchar uint8_t;
|
|||
typedef int int32_t;
|
||||
typedef uint uint32_t;
|
||||
|
||||
constant uint GGML_TYPE_Q4_0 = 2;
|
||||
constant uint GGML_TYPE_Q4_1 = 3;
|
||||
constant uint GGML_TYPE_Q5_0 = 6;
|
||||
constant uint GGML_TYPE_Q5_1 = 7;
|
||||
constant uint GGML_TYPE_Q8_0 = 8;
|
||||
constant uint GGML_TYPE_Q8_1 = 9;
|
||||
|
||||
constant uint QK4_0 = 32;
|
||||
constant uint QR4_0 = 2;
|
||||
struct block_q4_0
|
||||
|
@ -154,7 +161,19 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float*
|
|||
y[i*qk + j] = x[i].qs[j]*d;
|
||||
}
|
||||
|
||||
__kernel void dequantize_mul_mat_vec(__global struct block_q4_0* x, __local float* tmp, __global float* y, __global float* dst, int ncols) {
|
||||
void dequantize_q4_0(__global const struct block_q4_0* x, const int ib, const int iqs, float* v0, float* v1) {
|
||||
const float d = x[ib].d;
|
||||
|
||||
const uint8_t vui = x[ib].qs[iqs];
|
||||
|
||||
const int8_t vi0 = vui & 0xF;
|
||||
const int8_t vi1 = vui >> 4;
|
||||
|
||||
*v0 = (vi0 - 8)*d;
|
||||
*v1 = (vi1 - 8)*d;
|
||||
}
|
||||
|
||||
__kernel void dequantize_mul_mat_vec_q4_0(__global struct block_q4_0* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
|
||||
const int block_size = get_local_size(0);
|
||||
const int row = get_global_id(0) / block_size;
|
||||
const int tid = get_local_id(0);
|
||||
|
@ -173,15 +192,262 @@ __kernel void dequantize_mul_mat_vec(__global struct block_q4_0* x, __local floa
|
|||
const int iybs = col - col%qk; // y block start index
|
||||
|
||||
// dequantize
|
||||
const float d = x[ib].d;
|
||||
float v0, v1;
|
||||
dequantize_q4_0(x, ib, iqs, &v0, &v1);
|
||||
|
||||
const uint8_t vui = x[ib].qs[iqs];
|
||||
// matrix multiplication
|
||||
tmp[tid] += v0 * y[iybs + iqs + 0];
|
||||
tmp[tid] += v1 * y[iybs + iqs + y_offset];
|
||||
}
|
||||
|
||||
const int8_t vi0 = vui & 0xF;
|
||||
const int8_t vi1 = vui >> 4;
|
||||
// sum up partial sums and write back result
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for (int s=block_size/2; s>0; s>>=1) {
|
||||
if (tid < s) {
|
||||
tmp[tid] += tmp[tid + s];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if (tid == 0) {
|
||||
dst[row] = tmp[0];
|
||||
}
|
||||
}
|
||||
|
||||
float v0 = (vi0 - 8)*d;
|
||||
float v1 = (vi1 - 8)*d;
|
||||
void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const int iqs, float* v0, float* v1) {
|
||||
const float d = x[ib].d;
|
||||
const float m = x[ib].m;
|
||||
|
||||
const uint8_t vui = x[ib].qs[iqs];
|
||||
|
||||
const int8_t vi0 = vui & 0xF;
|
||||
const int8_t vi1 = vui >> 4;
|
||||
|
||||
*v0 = vi0*d + m;
|
||||
*v1 = vi1*d + m;
|
||||
}
|
||||
__kernel void dequantize_mul_mat_vec_q4_1(__global struct block_q4_1* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
|
||||
const int block_size = get_local_size(0);
|
||||
const int row = get_global_id(0) / block_size;
|
||||
const int tid = get_local_id(0);
|
||||
|
||||
const uint qk = QK4_1;
|
||||
const uint qr = QR4_1;
|
||||
|
||||
const int y_offset = qr == 1 ? 1 : qk/2;
|
||||
|
||||
tmp[tid] = 0;
|
||||
|
||||
for (int i = 0; i < ncols/block_size; i += 2) {
|
||||
const int col = i*block_size + 2*tid;
|
||||
const int ib = (row*ncols + col)/qk; // block index
|
||||
const int iqs = (col%qk)/qr; // quant index
|
||||
const int iybs = col - col%qk; // y block start index
|
||||
|
||||
// dequantize
|
||||
float v0, v1;
|
||||
dequantize_q4_1(x, ib, iqs, &v0, &v1);
|
||||
|
||||
// matrix multiplication
|
||||
tmp[tid] += v0 * y[iybs + iqs + 0];
|
||||
tmp[tid] += v1 * y[iybs + iqs + y_offset];
|
||||
}
|
||||
|
||||
// sum up partial sums and write back result
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for (int s=block_size/2; s>0; s>>=1) {
|
||||
if (tid < s) {
|
||||
tmp[tid] += tmp[tid + s];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if (tid == 0) {
|
||||
dst[row] = tmp[0];
|
||||
}
|
||||
}
|
||||
|
||||
void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const int iqs, float* v0, float* v1) {
|
||||
const float d = vload_half(0, (__global half*) &x[ib].d);
|
||||
|
||||
uint32_t qh = x[ib].qh;
|
||||
|
||||
const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
|
||||
|
||||
const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16;
|
||||
const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16;
|
||||
|
||||
*v0 = x0*d;
|
||||
*v1 = x1*d;
|
||||
}
|
||||
__kernel void dequantize_mul_mat_vec_q5_0(__global struct block_q5_0* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
|
||||
const int block_size = get_local_size(0);
|
||||
const int row = get_global_id(0) / block_size;
|
||||
const int tid = get_local_id(0);
|
||||
|
||||
const uint qk = QK5_0;
|
||||
const uint qr = QR5_0;
|
||||
|
||||
const int y_offset = qr == 1 ? 1 : qk/2;
|
||||
|
||||
tmp[tid] = 0;
|
||||
|
||||
for (int i = 0; i < ncols/block_size; i += 2) {
|
||||
const int col = i*block_size + 2*tid;
|
||||
const int ib = (row*ncols + col)/qk; // block index
|
||||
const int iqs = (col%qk)/qr; // quant index
|
||||
const int iybs = col - col%qk; // y block start index
|
||||
|
||||
// dequantize
|
||||
float v0, v1;
|
||||
dequantize_q5_0(x, ib, iqs, &v0, &v1);
|
||||
|
||||
// matrix multiplication
|
||||
tmp[tid] += v0 * y[iybs + iqs + 0];
|
||||
tmp[tid] += v1 * y[iybs + iqs + y_offset];
|
||||
}
|
||||
|
||||
// sum up partial sums and write back result
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for (int s=block_size/2; s>0; s>>=1) {
|
||||
if (tid < s) {
|
||||
tmp[tid] += tmp[tid + s];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if (tid == 0) {
|
||||
dst[row] = tmp[0];
|
||||
}
|
||||
}
|
||||
|
||||
void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const int iqs, float* v0, float* v1) {
|
||||
const float d = vload_half(0, (__global half*) &x[ib].d);
|
||||
const float m = vload_half(0, (__global half*) &x[ib].m);
|
||||
|
||||
uint32_t qh = x[ib].qh;
|
||||
|
||||
const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
|
||||
|
||||
const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0);
|
||||
const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1);
|
||||
|
||||
*v0 = x0*d + m;
|
||||
*v1 = x1*d + m;
|
||||
}
|
||||
__kernel void dequantize_mul_mat_vec_q5_1(__global struct block_q5_1* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
|
||||
const int block_size = get_local_size(0);
|
||||
const int row = get_global_id(0) / block_size;
|
||||
const int tid = get_local_id(0);
|
||||
|
||||
const uint qk = QK5_1;
|
||||
const uint qr = QR5_1;
|
||||
|
||||
const int y_offset = qr == 1 ? 1 : qk/2;
|
||||
|
||||
tmp[tid] = 0;
|
||||
|
||||
for (int i = 0; i < ncols/block_size; i += 2) {
|
||||
const int col = i*block_size + 2*tid;
|
||||
const int ib = (row*ncols + col)/qk; // block index
|
||||
const int iqs = (col%qk)/qr; // quant index
|
||||
const int iybs = col - col%qk; // y block start index
|
||||
|
||||
// dequantize
|
||||
float v0, v1;
|
||||
dequantize_q5_1(x, ib, iqs, &v0, &v1);
|
||||
|
||||
// matrix multiplication
|
||||
tmp[tid] += v0 * y[iybs + iqs + 0];
|
||||
tmp[tid] += v1 * y[iybs + iqs + y_offset];
|
||||
}
|
||||
|
||||
// sum up partial sums and write back result
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for (int s=block_size/2; s>0; s>>=1) {
|
||||
if (tid < s) {
|
||||
tmp[tid] += tmp[tid + s];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if (tid == 0) {
|
||||
dst[row] = tmp[0];
|
||||
}
|
||||
}
|
||||
|
||||
void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const int iqs, float* v0, float* v1) {
|
||||
const float d = x[ib].d;
|
||||
|
||||
const int8_t vi0 = x[ib].qs[iqs + 0];
|
||||
const int8_t vi1 = x[ib].qs[iqs + 1];
|
||||
|
||||
*v0 = vi0*d;
|
||||
*v1 = vi1*d;
|
||||
}
|
||||
__kernel void dequantize_mul_mat_vec_q8_0(__global struct block_q8_0* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
|
||||
const int block_size = get_local_size(0);
|
||||
const int row = get_global_id(0) / block_size;
|
||||
const int tid = get_local_id(0);
|
||||
|
||||
const uint qk = QK8_0;
|
||||
const uint qr = QR8_0;
|
||||
|
||||
const int y_offset = qr == 1 ? 1 : qk/2;
|
||||
|
||||
tmp[tid] = 0;
|
||||
|
||||
for (int i = 0; i < ncols/block_size; i += 2) {
|
||||
const int col = i*block_size + 2*tid;
|
||||
const int ib = (row*ncols + col)/qk; // block index
|
||||
const int iqs = (col%qk)/qr; // quant index
|
||||
const int iybs = col - col%qk; // y block start index
|
||||
|
||||
// dequantize
|
||||
float v0, v1;
|
||||
dequantize_q8_0(x, ib, iqs, &v0, &v1);
|
||||
|
||||
// matrix multiplication
|
||||
tmp[tid] += v0 * y[iybs + iqs + 0];
|
||||
tmp[tid] += v1 * y[iybs + iqs + y_offset];
|
||||
}
|
||||
|
||||
// sum up partial sums and write back result
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for (int s=block_size/2; s>0; s>>=1) {
|
||||
if (tid < s) {
|
||||
tmp[tid] += tmp[tid + s];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if (tid == 0) {
|
||||
dst[row] = tmp[0];
|
||||
}
|
||||
}
|
||||
|
||||
void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){
|
||||
*v0 = vload_half(0, (__global half*) &x[ib + 0]);
|
||||
*v1 = vload_half(0, (__global half*) &x[ib + 1]);
|
||||
}
|
||||
__kernel void convert_mul_mat_vec_f16(__global half* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
|
||||
const int block_size = get_local_size(0);
|
||||
const int row = get_global_id(0) / block_size;
|
||||
const int tid = get_local_id(0);
|
||||
|
||||
const uint qk = 32;
|
||||
const uint qr = 1;
|
||||
|
||||
const int y_offset = qr == 1 ? 1 : qk/2;
|
||||
|
||||
tmp[tid] = 0;
|
||||
|
||||
for (int i = 0; i < ncols/block_size; i += 2) {
|
||||
const int col = i*block_size + 2*tid;
|
||||
const int ib = (row*ncols + col)/qk; // block index
|
||||
const int iqs = (col%qk)/qr; // quant index
|
||||
const int iybs = col - col%qk; // y block start index
|
||||
|
||||
// convert
|
||||
float v0, v1;
|
||||
convert_f16(x, ib, iqs, &v0, &v1);
|
||||
|
||||
// matrix multiplication
|
||||
tmp[tid] += v0 * y[iybs + iqs + 0];
|
||||
|
@ -218,7 +484,7 @@ static cl_command_queue queue;
|
|||
static cl_program program;
|
||||
static cl_kernel convert_fp16_to_fp32_cl;
|
||||
static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl;
|
||||
static cl_kernel dequantize_mul_mat_vec_cl;
|
||||
static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl;
|
||||
static bool fp16_support;
|
||||
|
||||
static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
|
||||
|
@ -312,7 +578,17 @@ void ggml_cl_init(void) {
|
|||
CL_CHECK(err, "clCreateKernel");
|
||||
|
||||
// dequant mul mat kernel
|
||||
dequantize_mul_mat_vec_cl = clCreateKernel(program, "dequantize_mul_mat_vec", &err);
|
||||
dequantize_mul_mat_vec_q4_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_0", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
dequantize_mul_mat_vec_q4_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_1", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
dequantize_mul_mat_vec_q5_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_0", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
}
|
||||
|
||||
|
@ -338,17 +614,17 @@ static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
|
|||
static cl_kernel* ggml_get_dequantize_mul_mat_vec_cl(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
return &dequantize_mul_mat_vec_cl;
|
||||
// case GGML_TYPE_Q4_1:
|
||||
// return dequantize_mul_mat_vec_q4_1_cl;
|
||||
// case GGML_TYPE_Q5_0:
|
||||
// return dequantize_mul_mat_vec_q5_0_cl;
|
||||
// case GGML_TYPE_Q5_1:
|
||||
// return dequantize_mul_mat_vec_q5_1_cl;
|
||||
// case GGML_TYPE_Q8_0:
|
||||
// return dequantize_mul_mat_vec_q8_0_cl;
|
||||
// case GGML_TYPE_F16:
|
||||
// return convert_mul_mat_vec_f16_cl;
|
||||
return &dequantize_mul_mat_vec_q4_0_cl;
|
||||
case GGML_TYPE_Q4_1:
|
||||
return &dequantize_mul_mat_vec_q4_1_cl;
|
||||
case GGML_TYPE_Q5_0:
|
||||
return &dequantize_mul_mat_vec_q5_0_cl;
|
||||
case GGML_TYPE_Q5_1:
|
||||
return &dequantize_mul_mat_vec_q5_1_cl;
|
||||
case GGML_TYPE_Q8_0:
|
||||
return &dequantize_mul_mat_vec_q8_0_cl;
|
||||
case GGML_TYPE_F16:
|
||||
return &convert_mul_mat_vec_f16_cl;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
|
@ -646,6 +922,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|||
}
|
||||
|
||||
cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);
|
||||
cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type);
|
||||
GGML_ASSERT(to_fp32_cl != nullptr);
|
||||
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
|
@ -669,13 +946,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|||
const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
|
||||
const size_t local = CL_DMMV_BLOCK_SIZE;
|
||||
const cl_int ncols = ne00;
|
||||
CL_CHECK(clSetKernelArg(dequantize_mul_mat_vec_cl, 0, sizeof(cl_mem), &d_Q), "clSetKernelArg");
|
||||
CL_CHECK(clSetKernelArg(dequantize_mul_mat_vec_cl, 1, sizeof(float) * local, NULL), "clSetKernelArg");
|
||||
CL_CHECK(clSetKernelArg(dequantize_mul_mat_vec_cl, 2, sizeof(cl_mem), &d_Y), "clSetKernelArg");
|
||||
CL_CHECK(clSetKernelArg(dequantize_mul_mat_vec_cl, 3, sizeof(cl_mem), &d_D), "clSetKernelArg");
|
||||
CL_CHECK(clSetKernelArg(dequantize_mul_mat_vec_cl, 4, sizeof(cl_int), &ncols), "clSetKernelArg");
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q), "clSetKernelArg");
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL), "clSetKernelArg");
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y), "clSetKernelArg");
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D), "clSetKernelArg");
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols), "clSetKernelArg");
|
||||
CL_CHECK(clFinish(queue), "clFinish");
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, dequantize_mul_mat_vec_cl, 1, NULL, &global, &local, 0, NULL, &ev_sgemm), "clEnqueueNDRangeKernel");
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, 0, NULL, &ev_sgemm), "clEnqueueNDRangeKernel");
|
||||
} else { // general dequantization kernel + CLBlast matrix matrix multiplication
|
||||
// convert src0 to fp32 on device
|
||||
const size_t global = x_ne;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue