From 9b4186531246cf8716882755b5fcbad3ec635df3 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sat, 10 Jun 2023 21:52:32 +0800 Subject: [PATCH 1/9] Porting q2_k kernel to OpenCL --- ggml-opencl.cpp | 188 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 188 insertions(+) diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 5df922abd..bc4b8059c 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -59,6 +59,29 @@ struct __attribute__ ((packed)) block_q8_0 int8_t qs[QK8_0]; }; +struct __attribute__ ((packed)) block_q2_K +{ + uchar scales[16]; + uchar qs[64]; + half d; + half dmin; +}; + +struct __attribute__ ((packed)) block_q4_K +{ + uchar scales[12]; + uchar qs[128]; + half d; + half dmin; +}; + +struct __attribute__ ((packed)) block_q3_K +{ + uchar hmask[32]; + uchar qs[64]; + uchar scales[12]; + half d; +}; __kernel void convert_fp16_to_fp32(__global half* x, __global float* y) { const uint i = get_global_id(0); @@ -131,8 +154,120 @@ void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float *v0 = vload_half(0, &x[ib + 0]); *v1 = vload_half(0, &x[ib + 1]); } + +static inline void get_scale_min_k4(int j, const __global uchar *q, uchar *d, uchar *m) { + if (j < 4) { + *d = q[j] & 63; + *m = q[j + 4] & 63; + } else { + *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4); + *m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4); + } +} + +__kernel void dequantize_block_q2_K(__global const struct block_q2_K* x, __global float *yy) { + const int i = get_group_id(0); + const int tid = get_local_id(0); + const int n = tid / 32; + const int l = tid - 32 * n; + const int is = 8 * n + l / 16; + + const uchar q = x[i].qs[32 * n + l]; + __global float *y = yy + i * 256 + 128 * n; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + y[l + 0] = dall * (x[i].scales[is + 0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is + 0] >> 4); + y[l + 32] = dall * (x[i].scales[is + 2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is + 2] >> 4); + y[l + 64] = dall * (x[i].scales[is + 4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is + 4] >> 4); + y[l + 96] = dall * (x[i].scales[is + 6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is + 6] >> 4); +} + +__kernel void dequantize_block_q3_K(__global const struct block_q3_K* x, __global float *yy) { + int r = get_local_id(0) / 4; + int i = get_group_id(0); + int tid = r / 2; + int is0 = r % 2; + int l0 = 16 * is0 + 4 * (get_local_id(0) % 4); + int n = tid / 4; + int j = tid - 4 * n; + + uchar m = 1 << (4 * n + j); + int is = 8 * n + 2 * j + is0; + int shift = 2 * j; + + uchar us = is < 4 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 8] >> 0) & 3) << 4) : + is < 8 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 4] >> 2) & 3) << 4) : + is < 12 ? (x[i].scales[is - 8] >> 4) | (((x[i].scales[is + 0] >> 4) & 3) << 4) : + (x[i].scales[is - 8] >> 4) | (((x[i].scales[is - 4] >> 6) & 3) << 4); + float d_all = vload_half(0, &x[i].d); + float dl = d_all * (us - 32); + + __global float *y = yy + i * 256 + 128 * n + 32 * j; + const __global uchar *q = x[i].qs + 32 * n; + const __global uchar *hm = x[i].hmask; + + for (int l = l0; l < l0 + 4; ++l) + y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l / 8] & m) ? 0 : 4)); +} + +__kernel void dequantize_block_q4_K(__global const struct block_q4_K* x, __global float *yy) { + + const int i = get_group_id(0); + const int tid = get_local_id(0); + const int il = tid / 8; + const int ir = tid % 8; + const int is = 2 * il; + const int n = 4; + + __global float *y = yy + i * 256 + 64 * il + n * ir; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + __global const uchar *q = x[i].qs + 32 * il + n * ir; + + uchar sc, m; + get_scale_min_k4(is + 0, x[i].scales, &sc, &m); + float d1 = dall * sc; + float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, &sc, &m); + float d2 = dall * sc; + float m2 = dmin * m; + for (int l = 0; l < n; ++l) { + y[l + 0] = d1 * (q[l] & 0xF) - m1; + y[l + 32] = d2 * (q[l] >> 4) - m2; + } +} + ); +// __kernel void vec_dot_q2_K(__global const struct block_q2_K* x, const int ib, const int iqs, const __global float *yy, __global float *result) { + +// int n = iqs / 128; // 0 or 1 +// int r = iqs - 128 * n; // 0...120 in steps of 8 +// int l = r / 8; // 0...15 in steps of 1 + +// __global const float *y = yy + 128 * n + l; +// __global const uchar *q = x[ib].qs + 32 * n + l; +// __global const uchar *s = x[ib].scales + 8 * n; + +// const float dall = vload_half(0, &x[ib].d); +// const float dmin = vload_half(0, &x[ib].dmin); + +// float sum = y[0] * (dall * ((s[0] & 0xF) * ((q[0] >> 0) & 3)) - dmin * (s[0] >> 4)) +// + y[32] * (dall * ((s[2] & 0xF) * ((q[0] >> 2) & 3)) - dmin * (s[2] >> 4)) +// + y[64] * (dall * ((s[4] & 0xF) * ((q[0] >> 4) & 3)) - dmin * (s[4] >> 4)) +// + y[96] * (dall * ((s[6] & 0xF) * ((q[0] >> 6) & 3)) - dmin * (s[6] >> 4)) +// + y[16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4)) +// + y[48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4)) +// + y[80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4)) +// + y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4)); + +// *result = sum; +// } + std::string dequant_template = MULTILINE_QUOTE( __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) { const int i = get_group_id(0)*get_local_size(0) + get_local_id(0)*2; @@ -199,6 +334,45 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float } ); +// std::string dequant_mul_mat_vec_k_template = MULTILINE_QUOTE( +// __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { +// const int block_size = get_local_size(0); +// const int row = get_global_id(0) / block_size; +// const int tid = get_local_id(0); + +// const int iter_stride = 256; +// const int vals_per_iter = iter_stride; +// const int num_blocks_per_row = ncols / 256; +// const int ib0 = row*num_blocks_per_row; + +// tmp[tid] = 0; + +// for (int i = 0; i < ncols; i += iter_stride) { +// const int col = i + vals_per_iter*tid; +// const int ib = ib0 + col/QK_K; // x block index +// const int iqs = col%QK_K; // x quant index +// const int iybs = col - col%QK_K; // y block start index + +// // dequantize +// float v; +// dot_kernel(vx, ib, iqs, y + iybs, v); +// tmp += v; +// } + +// // sum up partial sums and write back result +// barrier(CLK_LOCAL_MEM_FENCE); +// for (int s=block_size/2; s>0; s>>=1) { +// if (tid < s) { +// tmp[tid] += tmp[tid + s]; +// } +// barrier(CLK_LOCAL_MEM_FENCE); +// } +// if (tid == 0) { +// dst[row] = tmp[0]; +// } +// } +// ); + std::string mul_template = MULTILINE_QUOTE( __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) { const int i = get_group_id(0)*get_local_size(0) + get_local_id(0); @@ -300,6 +474,7 @@ static cl_program program; static cl_kernel convert_row_f16_cl; static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl; static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl; +static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl; static cl_kernel mul_f32_cl; static bool fp16_support; @@ -529,6 +704,10 @@ void ggml_cl_init(void) { CL_CHECK((dequantize_row_q5_0_cl = clCreateKernel(program, "dequantize_row_q5_0", &err), err)); CL_CHECK((dequantize_row_q5_1_cl = clCreateKernel(program, "dequantize_row_q5_1", &err), err)); CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err)); + CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err)); + CL_CHECK((dequantize_block_q2_k_cl = clCreateKernel(program, "dequantize_block_q2_K", &err), err)); + CL_CHECK((dequantize_block_q3_k_cl = clCreateKernel(program, "dequantize_block_q3_K", &err), err)); + CL_CHECK((dequantize_block_q4_k_cl = clCreateKernel(program, "dequantize_block_q4_K", &err), err)); // dequant mul mat kernel CL_CHECK((dequantize_mul_mat_vec_q4_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_0", &err), err)); @@ -538,6 +717,7 @@ void ggml_cl_init(void) { CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err)); CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err)); + // mul kernel CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err)); } @@ -554,6 +734,12 @@ static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) { return &dequantize_row_q5_1_cl; case GGML_TYPE_Q8_0: return &dequantize_row_q8_0_cl; + case GGML_TYPE_Q2_K: + return &dequantize_block_q2_k_cl; + case GGML_TYPE_Q3_K: + return &dequantize_block_q3_k_cl; + case GGML_TYPE_Q4_K: + return &dequantize_block_q4_k_cl; case GGML_TYPE_F16: return &convert_row_f16_cl; default: @@ -1017,6 +1203,8 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type); GGML_ASSERT(to_fp32_cl != nullptr); + printf("\ntype:%d q_sz:%d y_sz:%d ne00:%d ne01:%d ne10:%d ne11:%d nb2:%d nb3:%d",type,q_size,y_size,ne00,ne01,ne10,ne11); + size_t ev_idx = 0; std::vector events; From 44422fd56773a2f66c24e5a8d726dfe4109f5adc Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 11 Jun 2023 12:47:21 +0200 Subject: [PATCH 2/9] Set global and local sizes for kernel calls for dequantizing k-quants --- ggml-opencl.cpp | 70 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 66 insertions(+), 4 deletions(-) diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index bc4b8059c..44d26d968 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -155,7 +155,7 @@ void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float *v1 = vload_half(0, &x[ib + 1]); } -static inline void get_scale_min_k4(int j, const __global uchar *q, uchar *d, uchar *m) { +inline void get_scale_min_k4(int j, const __global uchar *q, uchar *d, uchar *m) { if (j < 4) { *d = q[j] & 63; *m = q[j + 4] & 63; @@ -747,6 +747,64 @@ static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) { } } +static size_t ggml_cl_global_denom(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return 1; + case GGML_TYPE_Q4_1: + return 1; + case GGML_TYPE_Q5_0: + return 1; + case GGML_TYPE_Q5_1: + return 1; + case GGML_TYPE_Q8_0: + return 1; + case GGML_TYPE_Q2_K: + return 4; + case GGML_TYPE_Q3_K: + return 4; + case GGML_TYPE_Q4_K: + return 8; + case GGML_TYPE_Q5_K: + return 4; + case GGML_TYPE_Q6_K: + return 4; + case GGML_TYPE_F16: + return 1; + default: + return 1; + } +} + +static size_t ggml_cl_local_size(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return 0; + case GGML_TYPE_Q4_1: + return 0; + case GGML_TYPE_Q5_0: + return 0; + case GGML_TYPE_Q5_1: + return 0; + case GGML_TYPE_Q8_0: + return 0; + case GGML_TYPE_Q2_K: + return 64; + case GGML_TYPE_Q3_K: + return 64; + case GGML_TYPE_Q4_K: + return 32; + case GGML_TYPE_Q5_K: + return 64; + case GGML_TYPE_Q6_K: + return 64; + case GGML_TYPE_F16: + return 0; + default: + return 0; + } +} + static cl_kernel* ggml_get_dequantize_mul_mat_vec_cl(ggml_type type) { switch (type) { case GGML_TYPE_Q4_0: @@ -1199,11 +1257,15 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * d_Q = ggml_cl_pool_malloc(q_sz, &q_size); } + printf("\ntype:%d q_sz:%d y_sz:%d ne00:%d ne01:%d ne10:%d ne11:%d nb2:%d nb3:%d",type,q_size,y_size,ne00,ne01,ne10,ne11); + fflush(stdout); + cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type); cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type); GGML_ASSERT(to_fp32_cl != nullptr); - printf("\ntype:%d q_sz:%d y_sz:%d ne00:%d ne01:%d ne10:%d ne11:%d nb2:%d nb3:%d",type,q_size,y_size,ne00,ne01,ne10,ne11); + const size_t global_denom = ggml_cl_global_denom(type); + const size_t local = ggml_cl_local_size(type); size_t ev_idx = 0; std::vector events; @@ -1237,10 +1299,10 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++)); } else { // general dequantization kernel + CLBlast matrix matrix multiplication // convert src0 to fp32 on device - const size_t global = x_ne; + const size_t global = x_ne / global_denom; CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q)); CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X)); - CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, events.size(), !events.empty() ? events.data() : NULL, NULL)); + CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL)); // copy src1 to device CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL)); From 1506affd0a96dca301b72230df11fdec6f2ea7e8 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sun, 11 Jun 2023 22:29:43 +0800 Subject: [PATCH 3/9] Added q6_k kernel --- ggml-opencl.cpp | 81 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 56 insertions(+), 25 deletions(-) diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 44d26d968..81173c495 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -61,28 +61,36 @@ struct __attribute__ ((packed)) block_q8_0 struct __attribute__ ((packed)) block_q2_K { - uchar scales[16]; - uchar qs[64]; - half d; - half dmin; -}; - -struct __attribute__ ((packed)) block_q4_K -{ - uchar scales[12]; - uchar qs[128]; + uint8_t scales[16]; + uint8_t qs[64]; half d; half dmin; }; struct __attribute__ ((packed)) block_q3_K { - uchar hmask[32]; - uchar qs[64]; - uchar scales[12]; + uint8_t hmask[32]; + uint8_t qs[64]; + uint8_t scales[12]; half d; }; +struct __attribute__ ((packed)) block_q4_K +{ + uint8_t scales[12]; + uint8_t qs[128]; + half d; + half dmin; +}; + +struct __attribute__ ((packed)) block_q6_K +{ + uint8_t ql[128]; + uint8_t qh[64]; + int8_t scales[16]; + half d; +}; + __kernel void convert_fp16_to_fp32(__global half* x, __global float* y) { const uint i = get_global_id(0); @@ -172,7 +180,7 @@ __kernel void dequantize_block_q2_K(__global const struct block_q2_K* x, __globa const int l = tid - 32 * n; const int is = 8 * n + l / 16; - const uchar q = x[i].qs[32 * n + l]; + const uint8_t q = x[i].qs[32 * n + l]; __global float *y = yy + i * 256 + 128 * n; const float dall = vload_half(0, &x[i].d); @@ -193,11 +201,11 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K* x, __globa int n = tid / 4; int j = tid - 4 * n; - uchar m = 1 << (4 * n + j); + uint8_t m = 1 << (4 * n + j); int is = 8 * n + 2 * j + is0; int shift = 2 * j; - uchar us = is < 4 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 8] >> 0) & 3) << 4) : + int8_t us = is < 4 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 8] >> 0) & 3) << 4) : is < 8 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 4] >> 2) & 3) << 4) : is < 12 ? (x[i].scales[is - 8] >> 4) | (((x[i].scales[is + 0] >> 4) & 3) << 4) : (x[i].scales[is - 8] >> 4) | (((x[i].scales[is - 4] >> 6) & 3) << 4); @@ -205,11 +213,12 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K* x, __globa float dl = d_all * (us - 32); __global float *y = yy + i * 256 + 128 * n + 32 * j; - const __global uchar *q = x[i].qs + 32 * n; - const __global uchar *hm = x[i].hmask; + const __global uint8_t *q = x[i].qs + 32 * n; + const __global uint8_t *hm = x[i].hmask; for (int l = l0; l < l0 + 4; ++l) - y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l / 8] & m) ? 0 : 4)); + y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)); + } __kernel void dequantize_block_q4_K(__global const struct block_q4_K* x, __global float *yy) { @@ -226,9 +235,9 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K* x, __globa const float dall = vload_half(0, &x[i].d); const float dmin = vload_half(0, &x[i].dmin); - __global const uchar *q = x[i].qs + 32 * il + n * ir; + __global const uint8_t *q = x[i].qs + 32 * il + n * ir; - uchar sc, m; + uint8_t sc, m; get_scale_min_k4(is + 0, x[i].scales, &sc, &m); float d1 = dall * sc; float m1 = dmin * m; @@ -241,6 +250,28 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K* x, __globa } } +__kernel void dequantize_block_q6_K(__global const struct block_q6_K* x, __global float *yy) { + + const int i = get_group_id(0); + const int tid = get_local_id(0); + const int ip = tid / 32; + const int il = tid - 32 * ip; + const int is = 8 * ip + il / 16; + + __global float* y = yy + i * 256 + 128 * ip + il; + + const float d = vload_half(0, &x[i].d); + + __global const uint8_t * ql = x[i].ql + 64 * ip + il; + const uint8_t qh = x[i].qh[32 * ip + il]; + __global const int8_t * sc = x[i].scales + is; + + y[0] = d * sc[0] * ((int8_t)((ql[0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); + y[64] = d * sc[4] * ((int8_t)((ql[0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); + y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); +} + ); // __kernel void vec_dot_q2_K(__global const struct block_q2_K* x, const int ib, const int iqs, const __global float *yy, __global float *result) { @@ -474,7 +505,7 @@ static cl_program program; static cl_kernel convert_row_f16_cl; static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl; static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl; -static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl; +static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q6_k_cl; static cl_kernel mul_f32_cl; static bool fp16_support; @@ -708,6 +739,7 @@ void ggml_cl_init(void) { CL_CHECK((dequantize_block_q2_k_cl = clCreateKernel(program, "dequantize_block_q2_K", &err), err)); CL_CHECK((dequantize_block_q3_k_cl = clCreateKernel(program, "dequantize_block_q3_K", &err), err)); CL_CHECK((dequantize_block_q4_k_cl = clCreateKernel(program, "dequantize_block_q4_K", &err), err)); + CL_CHECK((dequantize_block_q6_k_cl = clCreateKernel(program, "dequantize_block_q6_K", &err), err)); // dequant mul mat kernel CL_CHECK((dequantize_mul_mat_vec_q4_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_0", &err), err)); @@ -740,6 +772,8 @@ static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) { return &dequantize_block_q3_k_cl; case GGML_TYPE_Q4_K: return &dequantize_block_q4_k_cl; + case GGML_TYPE_Q6_K: + return &dequantize_block_q6_k_cl; case GGML_TYPE_F16: return &convert_row_f16_cl; default: @@ -1257,9 +1291,6 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * d_Q = ggml_cl_pool_malloc(q_sz, &q_size); } - printf("\ntype:%d q_sz:%d y_sz:%d ne00:%d ne01:%d ne10:%d ne11:%d nb2:%d nb3:%d",type,q_size,y_size,ne00,ne01,ne10,ne11); - fflush(stdout); - cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type); cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type); GGML_ASSERT(to_fp32_cl != nullptr); From a4ee2b89d206cca5bb70f4e1be38e5ada5d617fe Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 12 Jun 2023 08:13:05 +0200 Subject: [PATCH 4/9] Fix q4_k opencl struct order --- ggml-opencl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 81173c495..79e63c41f 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -77,10 +77,10 @@ struct __attribute__ ((packed)) block_q3_K struct __attribute__ ((packed)) block_q4_K { - uint8_t scales[12]; - uint8_t qs[128]; half d; half dmin; + uint8_t scales[12]; + uint8_t qs[128]; }; struct __attribute__ ((packed)) block_q6_K From 56151bb875c225679a345f8109277f26e672db84 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Mon, 12 Jun 2023 14:20:44 +0800 Subject: [PATCH 5/9] Replace uchar with uint8_t --- ggml-opencl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 79e63c41f..35b385866 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -163,7 +163,7 @@ void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float *v1 = vload_half(0, &x[ib + 1]); } -inline void get_scale_min_k4(int j, const __global uchar *q, uchar *d, uchar *m) { +inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m) { if (j < 4) { *d = q[j] & 63; *m = q[j + 4] & 63; From f558e4c2978fb555d2adac77837d886337db2e36 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Mon, 12 Jun 2023 14:55:21 +0800 Subject: [PATCH 6/9] Finish dequant kernels --- ggml-opencl.cpp | 360 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 245 insertions(+), 115 deletions(-) diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 35b385866..0fca0aebe 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -25,27 +25,27 @@ typedef uchar uint8_t; typedef int int32_t; typedef uint uint32_t; -struct __attribute__ ((packed)) block_q4_0 +struct __attribute__((packed)) block_q4_0 { half d; uint8_t qs[QK4_0 / 2]; }; -struct __attribute__ ((packed)) block_q4_1 +struct __attribute__((packed)) block_q4_1 { half d; half m; uint8_t qs[QK4_1 / 2]; }; -struct __attribute__ ((packed)) block_q5_0 +struct __attribute__((packed)) block_q5_0 { half d; uint32_t qh; uint8_t qs[QK5_0 / 2]; }; -struct __attribute__ ((packed)) block_q5_1 +struct __attribute__((packed)) block_q5_1 { half d; half m; @@ -53,13 +53,13 @@ struct __attribute__ ((packed)) block_q5_1 uint8_t qs[QK5_1 / 2]; }; -struct __attribute__ ((packed)) block_q8_0 +struct __attribute__((packed)) block_q8_0 { half d; int8_t qs[QK8_0]; }; -struct __attribute__ ((packed)) block_q2_K +struct __attribute__((packed)) block_q2_K { uint8_t scales[16]; uint8_t qs[64]; @@ -67,7 +67,7 @@ struct __attribute__ ((packed)) block_q2_K half dmin; }; -struct __attribute__ ((packed)) block_q3_K +struct __attribute__((packed)) block_q3_K { uint8_t hmask[32]; uint8_t qs[64]; @@ -75,7 +75,7 @@ struct __attribute__ ((packed)) block_q3_K half d; }; -struct __attribute__ ((packed)) block_q4_K +struct __attribute__((packed)) block_q4_K { half d; half dmin; @@ -83,21 +83,32 @@ struct __attribute__ ((packed)) block_q4_K uint8_t qs[128]; }; -struct __attribute__ ((packed)) block_q6_K +struct __attribute__((packed)) block_q5_K +{ + half d; + half dmin; + uint8_t scales[12]; + uint8_t qh[32]; + uint8_t qs[128]; +}; + +struct __attribute__((packed)) block_q6_K { uint8_t ql[128]; uint8_t qh[64]; - int8_t scales[16]; - half d; + int8_t scales[16]; + half d; }; -__kernel void convert_fp16_to_fp32(__global half* x, __global float* y) { +__kernel void convert_fp16_to_fp32(__global half *x, __global float *y) +{ const uint i = get_global_id(0); y[i] = vload_half(0, &x[i]); } -void dequantize_q4_0(__global const struct block_q4_0* x, const int ib, const int iqs, float* v0, float* v1) { +void dequantize_q4_0(__global const struct block_q4_0 *x, const int ib, const int iqs, float *v0, float *v1) +{ const float d = vload_half(0, &x[ib].d); const uint8_t vui = x[ib].qs[iqs]; @@ -105,10 +116,10 @@ void dequantize_q4_0(__global const struct block_q4_0* x, const int ib, const in const int8_t vi0 = vui & 0xF; const int8_t vi1 = vui >> 4; - *v0 = (vi0 - 8)*d; - *v1 = (vi1 - 8)*d; -} -void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const int iqs, float* v0, float* v1) { + *v0 = (vi0 - 8) * d; + *v1 = (vi1 - 8) * d; +} void dequantize_q4_1(__global const struct block_q4_1 *x, const int ib, const int iqs, float *v0, float *v1) +{ const float d = vload_half(0, &x[ib].d); const float m = vload_half(0, &x[ib].m); @@ -117,63 +128,68 @@ void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const in const int8_t vi0 = vui & 0xF; const int8_t vi1 = vui >> 4; - *v0 = vi0*d + m; - *v1 = vi1*d + m; -} -void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const int iqs, float* v0, float* v1) { + *v0 = vi0 * d + m; + *v1 = vi1 * d + m; +} void dequantize_q5_0(__global const struct block_q5_0 *x, const int ib, const int iqs, float *v0, float *v1) +{ const float d = vload_half(0, &x[ib].d); uint32_t qh = x[ib].qh; - const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (iqs + 12))) & 0x10; const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16; - const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16; + const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16; - *v0 = x0*d; - *v1 = x1*d; -} -void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const int iqs, float* v0, float* v1) { + *v0 = x0 * d; + *v1 = x1 * d; +} void dequantize_q5_1(__global const struct block_q5_1 *x, const int ib, const int iqs, float *v0, float *v1) +{ const float d = vload_half(0, &x[ib].d); const float m = vload_half(0, &x[ib].m); uint32_t qh = x[ib].qh; - const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (iqs + 12))) & 0x10; const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0); - const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1); + const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1); - *v0 = x0*d + m; - *v1 = x1*d + m; -} -void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const int iqs, float* v0, float* v1) { + *v0 = x0 * d + m; + *v1 = x1 * d + m; +} void dequantize_q8_0(__global const struct block_q8_0 *x, const int ib, const int iqs, float *v0, float *v1) +{ const float d = vload_half(0, &x[ib].d); const int8_t vi0 = x[ib].qs[iqs + 0]; const int8_t vi1 = x[ib].qs[iqs + 1]; - *v0 = vi0*d; - *v1 = vi1*d; -} -void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){ + *v0 = vi0 * d; + *v1 = vi1 * d; +} void convert_f16(__global half *x, const int ib, const int iqs, float *v0, float *v1) +{ *v0 = vload_half(0, &x[ib + 0]); *v1 = vload_half(0, &x[ib + 1]); } -inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m) { - if (j < 4) { +inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m) +{ + if (j < 4) + { *d = q[j] & 63; *m = q[j + 4] & 63; - } else { + } + else + { *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4); *m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4); } } -__kernel void dequantize_block_q2_K(__global const struct block_q2_K* x, __global float *yy) { +__kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy) +{ const int i = get_group_id(0); const int tid = get_local_id(0); const int n = tid / 32; @@ -192,7 +208,8 @@ __kernel void dequantize_block_q2_K(__global const struct block_q2_K* x, __globa y[l + 96] = dall * (x[i].scales[is + 6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is + 6] >> 4); } -__kernel void dequantize_block_q3_K(__global const struct block_q3_K* x, __global float *yy) { +__kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy) +{ int r = get_local_id(0) / 4; int i = get_group_id(0); int tid = r / 2; @@ -205,10 +222,9 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K* x, __globa int is = 8 * n + 2 * j + is0; int shift = 2 * j; - int8_t us = is < 4 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 8] >> 0) & 3) << 4) : - is < 8 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 4] >> 2) & 3) << 4) : - is < 12 ? (x[i].scales[is - 8] >> 4) | (((x[i].scales[is + 0] >> 4) & 3) << 4) : - (x[i].scales[is - 8] >> 4) | (((x[i].scales[is - 4] >> 6) & 3) << 4); + int8_t us = is < 4 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 8] >> 0) & 3) << 4) : is < 8 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 4] >> 2) & 3) << 4) + : is < 12 ? (x[i].scales[is - 8] >> 4) | (((x[i].scales[is + 0] >> 4) & 3) << 4) + : (x[i].scales[is - 8] >> 4) | (((x[i].scales[is - 4] >> 6) & 3) << 4); float d_all = vload_half(0, &x[i].d); float dl = d_all * (us - 32); @@ -218,11 +234,10 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K* x, __globa for (int l = l0; l < l0 + 4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)); - } -__kernel void dequantize_block_q4_K(__global const struct block_q4_K* x, __global float *yy) { - +__kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy) +{ const int i = get_group_id(0); const int tid = get_local_id(0); const int il = tid / 8; @@ -244,27 +259,60 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K* x, __globa get_scale_min_k4(is + 1, x[i].scales, &sc, &m); float d2 = dall * sc; float m2 = dmin * m; - for (int l = 0; l < n; ++l) { + for (int l = 0; l < n; ++l) + { y[l + 0] = d1 * (q[l] & 0xF) - m1; y[l + 32] = d2 * (q[l] >> 4) - m2; } } -__kernel void dequantize_block_q6_K(__global const struct block_q6_K* x, __global float *yy) { +__kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy) +{ + const int i = get_group_id(0); + const int tid = get_local_id(0); + const int il = tid / 16; + const int ir = tid % 16; + const int is = 2 * il; + __global float *y = yy + i * 256 + 64 * il + 2 * ir; + + const float dall = vload_half(0, &x[i].d); + const float dmin = vload_half(0, &x[i].dmin); + + __global const uint8_t *ql = x[i].qs + 32 * il + 2 * ir; + __global const uint8_t *qh = x[i].qh + 2 * ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, &sc, &m); + const float d1 = dall * sc; + const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, &sc, &m); + const float d2 = dall * sc; + const float m2 = dmin * m; + + uint8_t hm = 1 << (2 * il); + y[0] = d1 * ((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0)) - m1; + y[1] = d1 * ((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0)) - m1; + hm <<= 1; + y[32] = d2 * ((ql[0] >> 4) + (qh[0] & hm ? 16 : 0)) - m2; + y[33] = d2 * ((ql[1] >> 4) + (qh[1] & hm ? 16 : 0)) - m2; +} + +__kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy) +{ const int i = get_group_id(0); const int tid = get_local_id(0); const int ip = tid / 32; const int il = tid - 32 * ip; const int is = 8 * ip + il / 16; - __global float* y = yy + i * 256 + 128 * ip + il; + __global float *y = yy + i * 256 + 128 * ip + il; const float d = vload_half(0, &x[i].d); - __global const uint8_t * ql = x[i].ql + 64 * ip + il; + __global const uint8_t *ql = x[i].ql + 64 * ip + il; const uint8_t qh = x[i].qh[32 * ip + il]; - __global const int8_t * sc = x[i].scales + is; + __global const int8_t *sc = x[i].scales + is; y[0] = d * sc[0] * ((int8_t)((ql[0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32); y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); @@ -272,32 +320,85 @@ __kernel void dequantize_block_q6_K(__global const struct block_q6_K* x, __globa y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); } + +void vec_dot_q2_K(__global const struct block_q2_K* x, const int ib, const int iqs, const __global float *yy, float *result) { + + int n = iqs / 128; + int r = iqs - 128 * n; + int l = r / 8; + + __global const float *y = yy + 128 * n + l; + __global const uint8_t *q = x[ib].qs + 32 * n + l; + __global const uint8_t *s = x[ib].scales + 8 * n; + + const float dall = vload_half(0, &x[ib].d); + const float dmin = vload_half(0, &x[ib].dmin); + + float sum = y[ 0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4)) + + y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4)) + + y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4)) + + y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4)) + + y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4)) + + y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4)) + + y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4)) + + y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4)); + + *result = sum; +} + +void vec_dot_q3_K(__global const struct block_q3_K* x, const int ib, const int iqs, const __global float *yy, float *result) { + + const uint32_t kmask1 = 0x03030303; + const uint32_t kmask2 = 0x0f0f0f0f; + + uint32_t aux[3]; + uint32_t utmp[4]; + + int n = iqs/128; + int r = iqs - 128*n; + int l = r/8; + + __global const float * y = yy + 128*n + l; + __global const uint8_t * q = x[ib].qs + 32*n + l; + __global const uint8_t * hm = x[ib].hmask + l; + const int8_t * s = (const int8_t *)utmp + 8*n; + + aux[0] |= x[ib].scales[0]; + aux[0] |= x[ib].scales[1] << 8; + aux[0] |= x[ib].scales[2] << 16; + aux[0] |= x[ib].scales[3] << 24; + aux[1] |= x[ib].scales[4]; + aux[1] |= x[ib].scales[5] << 8; + aux[1] |= x[ib].scales[6] << 16; + aux[1] |= x[ib].scales[7] << 24; + aux[2] |= x[ib].scales[8]; + aux[2] |= x[ib].scales[9] << 8; + aux[2] |= x[ib].scales[10] << 16; + aux[2] |= x[ib].scales[11] << 24; + + utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); + utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); + utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); + utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); + + const float dall = vload_half(0, &x[ib].d); + const uint8_t m = 1 << (4*n); + + float sum = y[ 0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4)) + + y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4)) + + y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4)) + + y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4)) + + y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4)) + + y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4)) + + y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4)) + + y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4)); + + *result = sum * dall; + +} + ); -// __kernel void vec_dot_q2_K(__global const struct block_q2_K* x, const int ib, const int iqs, const __global float *yy, __global float *result) { - -// int n = iqs / 128; // 0 or 1 -// int r = iqs - 128 * n; // 0...120 in steps of 8 -// int l = r / 8; // 0...15 in steps of 1 - -// __global const float *y = yy + 128 * n + l; -// __global const uchar *q = x[ib].qs + 32 * n + l; -// __global const uchar *s = x[ib].scales + 8 * n; - -// const float dall = vload_half(0, &x[ib].d); -// const float dmin = vload_half(0, &x[ib].dmin); - -// float sum = y[0] * (dall * ((s[0] & 0xF) * ((q[0] >> 0) & 3)) - dmin * (s[0] >> 4)) -// + y[32] * (dall * ((s[2] & 0xF) * ((q[0] >> 2) & 3)) - dmin * (s[2] >> 4)) -// + y[64] * (dall * ((s[4] & 0xF) * ((q[0] >> 4) & 3)) - dmin * (s[4] >> 4)) -// + y[96] * (dall * ((s[6] & 0xF) * ((q[0] >> 6) & 3)) - dmin * (s[6] >> 4)) -// + y[16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4)) -// + y[48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4)) -// + y[80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4)) -// + y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4)); - -// *result = sum; -// } std::string dequant_template = MULTILINE_QUOTE( __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) { @@ -365,44 +466,44 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float } ); -// std::string dequant_mul_mat_vec_k_template = MULTILINE_QUOTE( -// __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { -// const int block_size = get_local_size(0); -// const int row = get_global_id(0) / block_size; -// const int tid = get_local_id(0); +std::string dequant_mul_mat_vec_k_template = MULTILINE_QUOTE( +__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { + const int block_size = get_local_size(0); + const int row = get_global_id(0) / block_size; + const int tid = get_local_id(0); -// const int iter_stride = 256; -// const int vals_per_iter = iter_stride; -// const int num_blocks_per_row = ncols / 256; -// const int ib0 = row*num_blocks_per_row; + const int iter_stride = 256; + const int vals_per_iter = iter_stride; + const int num_blocks_per_row = ncols / 256; + const int ib0 = row*num_blocks_per_row; -// tmp[tid] = 0; + tmp[tid] = 0; -// for (int i = 0; i < ncols; i += iter_stride) { -// const int col = i + vals_per_iter*tid; -// const int ib = ib0 + col/QK_K; // x block index -// const int iqs = col%QK_K; // x quant index -// const int iybs = col - col%QK_K; // y block start index + for (int i = 0; i < ncols; i += iter_stride) { + const int col = i + vals_per_iter*tid; + const int ib = ib0 + col/256; // x block index + const int iqs = col%256; // x quant index + const int iybs = col - col%256; // y block start index -// // dequantize -// float v; -// dot_kernel(vx, ib, iqs, y + iybs, v); -// tmp += v; -// } + // dequantize + float v; + dot_kernel(x, ib, iqs, y + iybs, &v); + tmp[tid] += v; + } -// // sum up partial sums and write back result -// barrier(CLK_LOCAL_MEM_FENCE); -// for (int s=block_size/2; s>0; s>>=1) { -// if (tid < s) { -// tmp[tid] += tmp[tid + s]; -// } -// barrier(CLK_LOCAL_MEM_FENCE); -// } -// if (tid == 0) { -// dst[row] = tmp[0]; -// } -// } -// ); + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=block_size/2; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} +); std::string mul_template = MULTILINE_QUOTE( __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) { @@ -465,6 +566,11 @@ std::array mul_str_values = { "mul_f32", "float" }; +std::array dmmv_k_str_values = { + "dequantize_mul_mat_vec_q2_K", "struct block_q2_K", "vec_dot_q2_K", + "dequantize_mul_mat_vec_q3_K", "struct block_q3_K", "vec_dot_q3_K", +}; + std::string& replace(std::string& s, const std::string& from, const std::string& to) { size_t pos = 0; while ((pos = s.find(from, pos)) != std::string::npos) { @@ -494,6 +600,15 @@ std::string generate_kernels() { } src << mul_kernel << '\n'; } + for (size_t i = 0; i < dmmv_k_str_values.size(); i += 3) { + std::string dmmv_kernel = dequant_mul_mat_vec_k_template; + //just apply quick template fn name replacement for the K quant DMMVs since sizes are known + replace(dmmv_kernel, "KERNEL_NAME", dmmv_k_str_values[i]); + replace(dmmv_kernel, "X_TYPE", dmmv_k_str_values[i + 1]); + replace(dmmv_kernel, "dot_kernel", dmmv_k_str_values[i + 2]); + src << dmmv_kernel << '\n'; + } + return src.str(); } @@ -505,7 +620,8 @@ static cl_program program; static cl_kernel convert_row_f16_cl; static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl; static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl; -static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q6_k_cl; +static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl; +static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl; static cl_kernel mul_f32_cl; static bool fp16_support; @@ -739,6 +855,7 @@ void ggml_cl_init(void) { CL_CHECK((dequantize_block_q2_k_cl = clCreateKernel(program, "dequantize_block_q2_K", &err), err)); CL_CHECK((dequantize_block_q3_k_cl = clCreateKernel(program, "dequantize_block_q3_K", &err), err)); CL_CHECK((dequantize_block_q4_k_cl = clCreateKernel(program, "dequantize_block_q4_K", &err), err)); + CL_CHECK((dequantize_block_q5_k_cl = clCreateKernel(program, "dequantize_block_q5_K", &err), err)); CL_CHECK((dequantize_block_q6_k_cl = clCreateKernel(program, "dequantize_block_q6_K", &err), err)); // dequant mul mat kernel @@ -748,7 +865,8 @@ void ggml_cl_init(void) { CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err)); CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err)); CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err)); - + CL_CHECK((dequantize_mul_mat_vec_q2_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q2_K", &err), err)); + CL_CHECK((dequantize_mul_mat_vec_q3_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q3_K", &err), err)); // mul kernel CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err)); @@ -772,6 +890,8 @@ static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) { return &dequantize_block_q3_k_cl; case GGML_TYPE_Q4_K: return &dequantize_block_q4_k_cl; + case GGML_TYPE_Q5_K: + return &dequantize_block_q5_k_cl; case GGML_TYPE_Q6_K: return &dequantize_block_q6_k_cl; case GGML_TYPE_F16: @@ -853,6 +973,16 @@ static cl_kernel* ggml_get_dequantize_mul_mat_vec_cl(ggml_type type) { return &dequantize_mul_mat_vec_q8_0_cl; case GGML_TYPE_F16: return &convert_mul_mat_vec_f16_cl; + case GGML_TYPE_Q2_K: + return &dequantize_mul_mat_vec_q2_K_cl; + case GGML_TYPE_Q3_K: + return &dequantize_mul_mat_vec_q3_K_cl; + case GGML_TYPE_Q4_K: + return &dequantize_mul_mat_vec_q4_K_cl; + case GGML_TYPE_Q5_K: + return &dequantize_mul_mat_vec_q5_K_cl; + case GGML_TYPE_Q6_K: + return &dequantize_mul_mat_vec_q6_K_cl; default: return nullptr; } From 6e20827f933657fedeb63ad99ff069ce4ee814b8 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Mon, 12 Jun 2023 19:31:09 +0800 Subject: [PATCH 7/9] Added OpenCL DMMV kernels --- ggml-opencl.cpp | 165 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 129 insertions(+), 36 deletions(-) diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 0fca0aebe..042855402 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -25,27 +25,27 @@ typedef uchar uint8_t; typedef int int32_t; typedef uint uint32_t; -struct __attribute__((packed)) block_q4_0 +struct __attribute__ ((packed)) block_q4_0 { half d; uint8_t qs[QK4_0 / 2]; }; -struct __attribute__((packed)) block_q4_1 +struct __attribute__ ((packed)) block_q4_1 { half d; half m; uint8_t qs[QK4_1 / 2]; }; -struct __attribute__((packed)) block_q5_0 +struct __attribute__ ((packed)) block_q5_0 { half d; uint32_t qh; uint8_t qs[QK5_0 / 2]; }; -struct __attribute__((packed)) block_q5_1 +struct __attribute__ ((packed)) block_q5_1 { half d; half m; @@ -53,7 +53,7 @@ struct __attribute__((packed)) block_q5_1 uint8_t qs[QK5_1 / 2]; }; -struct __attribute__((packed)) block_q8_0 +struct __attribute__ ((packed)) block_q8_0 { half d; int8_t qs[QK8_0]; @@ -100,15 +100,13 @@ struct __attribute__((packed)) block_q6_K half d; }; -__kernel void convert_fp16_to_fp32(__global half *x, __global float *y) -{ +__kernel void convert_fp16_to_fp32(__global half* x, __global float* y) { const uint i = get_global_id(0); y[i] = vload_half(0, &x[i]); } -void dequantize_q4_0(__global const struct block_q4_0 *x, const int ib, const int iqs, float *v0, float *v1) -{ +void dequantize_q4_0(__global const struct block_q4_0* x, const int ib, const int iqs, float* v0, float* v1) { const float d = vload_half(0, &x[ib].d); const uint8_t vui = x[ib].qs[iqs]; @@ -116,10 +114,10 @@ void dequantize_q4_0(__global const struct block_q4_0 *x, const int ib, const in const int8_t vi0 = vui & 0xF; const int8_t vi1 = vui >> 4; - *v0 = (vi0 - 8) * d; - *v1 = (vi1 - 8) * d; -} void dequantize_q4_1(__global const struct block_q4_1 *x, const int ib, const int iqs, float *v0, float *v1) -{ + *v0 = (vi0 - 8)*d; + *v1 = (vi1 - 8)*d; +} +void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const int iqs, float* v0, float* v1) { const float d = vload_half(0, &x[ib].d); const float m = vload_half(0, &x[ib].m); @@ -128,48 +126,48 @@ void dequantize_q4_0(__global const struct block_q4_0 *x, const int ib, const in const int8_t vi0 = vui & 0xF; const int8_t vi1 = vui >> 4; - *v0 = vi0 * d + m; - *v1 = vi1 * d + m; -} void dequantize_q5_0(__global const struct block_q5_0 *x, const int ib, const int iqs, float *v0, float *v1) -{ + *v0 = vi0*d + m; + *v1 = vi1*d + m; +} +void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const int iqs, float* v0, float* v1) { const float d = vload_half(0, &x[ib].d); uint32_t qh = x[ib].qh; - const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (iqs + 12))) & 0x10; + const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16; - const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16; + const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16; - *v0 = x0 * d; - *v1 = x1 * d; -} void dequantize_q5_1(__global const struct block_q5_1 *x, const int ib, const int iqs, float *v0, float *v1) -{ + *v0 = x0*d; + *v1 = x1*d; +} +void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const int iqs, float* v0, float* v1) { const float d = vload_half(0, &x[ib].d); const float m = vload_half(0, &x[ib].m); uint32_t qh = x[ib].qh; - const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (iqs + 12))) & 0x10; + const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0); - const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1); + const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1); - *v0 = x0 * d + m; - *v1 = x1 * d + m; -} void dequantize_q8_0(__global const struct block_q8_0 *x, const int ib, const int iqs, float *v0, float *v1) -{ + *v0 = x0*d + m; + *v1 = x1*d + m; +} +void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const int iqs, float* v0, float* v1) { const float d = vload_half(0, &x[ib].d); const int8_t vi0 = x[ib].qs[iqs + 0]; const int8_t vi1 = x[ib].qs[iqs + 1]; - *v0 = vi0 * d; - *v1 = vi1 * d; -} void convert_f16(__global half *x, const int ib, const int iqs, float *v0, float *v1) -{ + *v0 = vi0*d; + *v1 = vi1*d; +} +void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){ *v0 = vload_half(0, &x[ib + 0]); *v1 = vload_half(0, &x[ib + 1]); } @@ -397,6 +395,95 @@ void vec_dot_q3_K(__global const struct block_q3_K* x, const int ib, const int i } +void vec_dot_q4_K(__global const struct block_q4_K* x, const int ib, const int iqs, const __global float *yy, float *result) { + + const int j = iqs / 64; // j is in 0...3 + const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4 + const int is = 2*j; // is is in 0...6 in steps of 2 + + __global const float * y = yy + 64*j + ir; + __global const uint8_t * q = x[ib].qs + 32*j + ir; + + const float dall = vload_half(0, &x[ib].d); + const float dmin = vload_half(0, &x[ib].dmin); + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[ib].scales, &sc, &m); + const float d1 = dall * sc; + const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[ib].scales, &sc, &m); + const float d2 = dall * sc; + const float m2 = dmin * m; + + float sum = 0; + for (int k = 0; k < 4; ++k) { + sum += y[k + 0] * (d1 * (q[k] & 0xF) - m1); + sum += y[k + 32] * (d2 * (q[k] >> 4) - m2); + } + + *result = sum; +} + +void vec_dot_q5_K(__global const struct block_q5_K* x, const int ib, const int iqs, const __global float *yy, float *result) { + + const int j = iqs / 64; + const int ir = (iqs - 64*j)/2; + const int is = 2*j; + + __global const float * y = yy + 64*j + ir; + __global const uint8_t * ql = x[ib].qs + 32*j + ir; + __global const uint8_t * qh = x[ib].qh + ir; + + const float dall = vload_half(0, &x[ib].d); + const float dmin = vload_half(0, &x[ib].dmin); + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[ib].scales, &sc, &m); + const float d1 = dall * sc; + const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[ib].scales, &sc, &m); + const float d2 = dall * sc; + const float m2 = dmin * m; + + uint8_t hm = 1 << is; + float sum = 0; + for (int k = 0; k < 4; ++k) { + sum += y[k + 0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1); + } + hm <<= 1; + for (int k = 0; k < 4; ++k) { + sum += y[k + 32] * (d2 * ((ql[k] >> 4) + (qh[k] & hm ? 16 : 0)) - m2); + } + *result = sum; + +} + +void vec_dot_q6_K(__global const struct block_q6_K* x, const int ib, const int iqs, const __global float *yy, float *result) { + + + const int ip = iqs / 128; // 0 or 1 + const int il = (iqs - 128*ip)/8; // 0...15 + const int is = 8*ip; + + __global const float * y = yy + 128*ip + il; + + const float d = vload_half(0, &x[ib].d); + + __global const uint8_t * ql = x[ib].ql + 64*ip + il; + const uint8_t * qh = x[ib].qh + 32*ip + il; + __global const int8_t * sc = x[ib].scales + is; + + *result = y[ 0] * d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh[ 0] >> 0) & 3) << 4)) - 32) + + y[ 32] * d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh[ 0] >> 2) & 3) << 4)) - 32) + + y[ 64] * d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh[ 0] >> 4) & 3) << 4)) - 32) + + y[ 96] * d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh[ 0] >> 6) & 3) << 4)) - 32) + + y[ 16] * d * sc[1] * ((int8_t)((ql[16] & 0xF) | (((qh[16] >> 0) & 3) << 4)) - 32) + + y[ 48] * d * sc[3] * ((int8_t)((ql[48] & 0xF) | (((qh[16] >> 2) & 3) << 4)) - 32) + + y[ 80] * d * sc[5] * ((int8_t)((ql[16] >> 4) | (((qh[16] >> 4) & 3) << 4)) - 32) + + y[112] * d * sc[7] * ((int8_t)((ql[48] >> 4) | (((qh[16] >> 6) & 3) << 4)) - 32); + +} + ); @@ -566,9 +653,12 @@ std::array mul_str_values = { "mul_f32", "float" }; -std::array dmmv_k_str_values = { +std::array dmmv_k_str_values = { "dequantize_mul_mat_vec_q2_K", "struct block_q2_K", "vec_dot_q2_K", "dequantize_mul_mat_vec_q3_K", "struct block_q3_K", "vec_dot_q3_K", + "dequantize_mul_mat_vec_q4_K", "struct block_q4_K", "vec_dot_q4_K", + "dequantize_mul_mat_vec_q5_K", "struct block_q5_K", "vec_dot_q5_K", + "dequantize_mul_mat_vec_q6_K", "struct block_q6_K", "vec_dot_q6_K", }; std::string& replace(std::string& s, const std::string& from, const std::string& to) { @@ -867,6 +957,9 @@ void ggml_cl_init(void) { CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err)); CL_CHECK((dequantize_mul_mat_vec_q2_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q2_K", &err), err)); CL_CHECK((dequantize_mul_mat_vec_q3_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q3_K", &err), err)); + CL_CHECK((dequantize_mul_mat_vec_q4_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_K", &err), err)); + CL_CHECK((dequantize_mul_mat_vec_q5_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_K", &err), err)); + CL_CHECK((dequantize_mul_mat_vec_q6_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q6_K", &err), err)); // mul kernel CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err)); From fc8c823f34a3fb304d614c9064378f8521392c22 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 12 Jun 2023 20:02:56 +0200 Subject: [PATCH 8/9] Fix q2_k, improve code --- ggml-opencl.cpp | 43 +++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 042855402..2691d7389 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -15,7 +15,7 @@ #include "ggml.h" -#define CL_DMMV_BLOCK_SIZE 32; +#define CL_DMMV_BLOCK_SIZE 32 #define MULTILINE_QUOTE(...) #__VA_ARGS__ static std::string program_source = MULTILINE_QUOTE( @@ -349,7 +349,6 @@ void vec_dot_q3_K(__global const struct block_q3_K* x, const int ib, const int i const uint32_t kmask1 = 0x03030303; const uint32_t kmask2 = 0x0f0f0f0f; - uint32_t aux[3]; uint32_t utmp[4]; int n = iqs/128; @@ -361,18 +360,7 @@ void vec_dot_q3_K(__global const struct block_q3_K* x, const int ib, const int i __global const uint8_t * hm = x[ib].hmask + l; const int8_t * s = (const int8_t *)utmp + 8*n; - aux[0] |= x[ib].scales[0]; - aux[0] |= x[ib].scales[1] << 8; - aux[0] |= x[ib].scales[2] << 16; - aux[0] |= x[ib].scales[3] << 24; - aux[1] |= x[ib].scales[4]; - aux[1] |= x[ib].scales[5] << 8; - aux[1] |= x[ib].scales[6] << 16; - aux[1] |= x[ib].scales[7] << 24; - aux[2] |= x[ib].scales[8]; - aux[2] |= x[ib].scales[9] << 8; - aux[2] |= x[ib].scales[10] << 16; - aux[2] |= x[ib].scales[11] << 24; + __global const uint32_t* aux = (__global const uint32_t*) x[ib].scales; utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); @@ -470,7 +458,7 @@ void vec_dot_q6_K(__global const struct block_q6_K* x, const int ib, const int i const float d = vload_half(0, &x[ib].d); __global const uint8_t * ql = x[ib].ql + 64*ip + il; - const uint8_t * qh = x[ib].qh + 32*ip + il; + __global const uint8_t * qh = x[ib].qh + 32*ip + il; __global const int8_t * sc = x[ib].scales + is; *result = y[ 0] * d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh[ 0] >> 0) & 3) << 4)) - 32) @@ -514,7 +502,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) { std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE( __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { const int block_size = get_local_size(0); - const int row = get_global_id(0) / block_size; + const int row = get_group_id(0); const int tid = get_local_id(0); const uint qk = QUANT_K; @@ -556,11 +544,11 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float std::string dequant_mul_mat_vec_k_template = MULTILINE_QUOTE( __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { const int block_size = get_local_size(0); - const int row = get_global_id(0) / block_size; + const int row = get_group_id(0); const int tid = get_local_id(0); const int iter_stride = 256; - const int vals_per_iter = iter_stride; + const int vals_per_iter = iter_stride / block_size; const int num_blocks_per_row = ncols / 256; const int ib0 = row*num_blocks_per_row; @@ -574,7 +562,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float // dequantize float v; - dot_kernel(x, ib, iqs, y + iybs, &v); + DOT_KERNEL(x, ib, iqs, y + iybs, &v); tmp[tid] += v; } @@ -653,6 +641,10 @@ std::array mul_str_values = { "mul_f32", "float" }; +std::array dmmv_k_str_keys = { + "KERNEL_NAME", "X_TYPE", "DOT_KERNEL" +}; + std::array dmmv_k_str_values = { "dequantize_mul_mat_vec_q2_K", "struct block_q2_K", "vec_dot_q2_K", "dequantize_mul_mat_vec_q3_K", "struct block_q3_K", "vec_dot_q3_K", @@ -690,13 +682,12 @@ std::string generate_kernels() { } src << mul_kernel << '\n'; } - for (size_t i = 0; i < dmmv_k_str_values.size(); i += 3) { - std::string dmmv_kernel = dequant_mul_mat_vec_k_template; - //just apply quick template fn name replacement for the K quant DMMVs since sizes are known - replace(dmmv_kernel, "KERNEL_NAME", dmmv_k_str_values[i]); - replace(dmmv_kernel, "X_TYPE", dmmv_k_str_values[i + 1]); - replace(dmmv_kernel, "dot_kernel", dmmv_k_str_values[i + 2]); - src << dmmv_kernel << '\n'; + for (size_t i = 0; i < dmmv_k_str_values.size(); i += dmmv_k_str_keys.size()) { + std::string dmmv_k_kernel = dequant_mul_mat_vec_k_template; + for (size_t j = 0; j < dmmv_k_str_keys.size(); j++) { + replace(dmmv_k_kernel, dmmv_k_str_keys[j], dmmv_k_str_values[i + j]); + } + src << dmmv_k_kernel << '\n'; } return src.str(); From 2a972f36499d27a6d040e8c61c008cdbca3ed764 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Tue, 13 Jun 2023 08:25:32 +0200 Subject: [PATCH 9/9] Fix q3_k --- ggml-opencl.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 2691d7389..57495b684 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -349,6 +349,7 @@ void vec_dot_q3_K(__global const struct block_q3_K* x, const int ib, const int i const uint32_t kmask1 = 0x03030303; const uint32_t kmask2 = 0x0f0f0f0f; + uint32_t aux[3]; uint32_t utmp[4]; int n = iqs/128; @@ -360,7 +361,9 @@ void vec_dot_q3_K(__global const struct block_q3_K* x, const int ib, const int i __global const uint8_t * hm = x[ib].hmask + l; const int8_t * s = (const int8_t *)utmp + 8*n; - __global const uint32_t* aux = (__global const uint32_t*) x[ib].scales; + aux[0] = x[ib].scales[0] | x[ib].scales[1] << 8 | x[ib].scales[2] << 16 | x[ib].scales[3] << 24; + aux[1] = x[ib].scales[4] | x[ib].scales[5] << 8 | x[ib].scales[6] << 16 | x[ib].scales[7] << 24; + aux[2] = x[ib].scales[8] | x[ib].scales[9] << 8 | x[ib].scales[10] << 16 | x[ib].scales[11] << 24; utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);