Finish dequant kernels

This commit is contained in:
Concedo 2023-06-12 14:55:21 +08:00 committed by 0cc4m
parent 56151bb875
commit f558e4c297

View file

@ -83,6 +83,15 @@ struct __attribute__ ((packed)) block_q4_K
uint8_t qs[128]; uint8_t qs[128];
}; };
struct __attribute__((packed)) block_q5_K
{
half d;
half dmin;
uint8_t scales[12];
uint8_t qh[32];
uint8_t qs[128];
};
struct __attribute__((packed)) block_q6_K struct __attribute__((packed)) block_q6_K
{ {
uint8_t ql[128]; uint8_t ql[128];
@ -91,13 +100,15 @@ struct __attribute__ ((packed)) block_q6_K
half d; half d;
}; };
__kernel void convert_fp16_to_fp32(__global half* x, __global float* y) { __kernel void convert_fp16_to_fp32(__global half *x, __global float *y)
{
const uint i = get_global_id(0); const uint i = get_global_id(0);
y[i] = vload_half(0, &x[i]); y[i] = vload_half(0, &x[i]);
} }
void dequantize_q4_0(__global const struct block_q4_0* x, const int ib, const int iqs, float* v0, float* v1) { void dequantize_q4_0(__global const struct block_q4_0 *x, const int ib, const int iqs, float *v0, float *v1)
{
const float d = vload_half(0, &x[ib].d); const float d = vload_half(0, &x[ib].d);
const uint8_t vui = x[ib].qs[iqs]; const uint8_t vui = x[ib].qs[iqs];
@ -107,8 +118,8 @@ void dequantize_q4_0(__global const struct block_q4_0* x, const int ib, const in
*v0 = (vi0 - 8) * d; *v0 = (vi0 - 8) * d;
*v1 = (vi1 - 8) * d; *v1 = (vi1 - 8) * d;
} } void dequantize_q4_1(__global const struct block_q4_1 *x, const int ib, const int iqs, float *v0, float *v1)
void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const int iqs, float* v0, float* v1) { {
const float d = vload_half(0, &x[ib].d); const float d = vload_half(0, &x[ib].d);
const float m = vload_half(0, &x[ib].m); const float m = vload_half(0, &x[ib].m);
@ -119,8 +130,8 @@ void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const in
*v0 = vi0 * d + m; *v0 = vi0 * d + m;
*v1 = vi1 * d + m; *v1 = vi1 * d + m;
} } void dequantize_q5_0(__global const struct block_q5_0 *x, const int ib, const int iqs, float *v0, float *v1)
void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const int iqs, float* v0, float* v1) { {
const float d = vload_half(0, &x[ib].d); const float d = vload_half(0, &x[ib].d);
uint32_t qh = x[ib].qh; uint32_t qh = x[ib].qh;
@ -133,8 +144,8 @@ void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const in
*v0 = x0 * d; *v0 = x0 * d;
*v1 = x1 * d; *v1 = x1 * d;
} } void dequantize_q5_1(__global const struct block_q5_1 *x, const int ib, const int iqs, float *v0, float *v1)
void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const int iqs, float* v0, float* v1) { {
const float d = vload_half(0, &x[ib].d); const float d = vload_half(0, &x[ib].d);
const float m = vload_half(0, &x[ib].m); const float m = vload_half(0, &x[ib].m);
@ -148,8 +159,8 @@ void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const in
*v0 = x0 * d + m; *v0 = x0 * d + m;
*v1 = x1 * d + m; *v1 = x1 * d + m;
} } void dequantize_q8_0(__global const struct block_q8_0 *x, const int ib, const int iqs, float *v0, float *v1)
void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const int iqs, float* v0, float* v1) { {
const float d = vload_half(0, &x[ib].d); const float d = vload_half(0, &x[ib].d);
const int8_t vi0 = x[ib].qs[iqs + 0]; const int8_t vi0 = x[ib].qs[iqs + 0];
@ -157,23 +168,28 @@ void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const in
*v0 = vi0 * d; *v0 = vi0 * d;
*v1 = vi1 * d; *v1 = vi1 * d;
} } void convert_f16(__global half *x, const int ib, const int iqs, float *v0, float *v1)
void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){ {
*v0 = vload_half(0, &x[ib + 0]); *v0 = vload_half(0, &x[ib + 0]);
*v1 = vload_half(0, &x[ib + 1]); *v1 = vload_half(0, &x[ib + 1]);
} }
inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m) { inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m)
if (j < 4) { {
if (j < 4)
{
*d = q[j] & 63; *d = q[j] & 63;
*m = q[j + 4] & 63; *m = q[j + 4] & 63;
} else { }
else
{
*d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4); *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4);
*m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4); *m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4);
} }
} }
__kernel void dequantize_block_q2_K(__global const struct block_q2_K* x, __global float *yy) { __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy)
{
const int i = get_group_id(0); const int i = get_group_id(0);
const int tid = get_local_id(0); const int tid = get_local_id(0);
const int n = tid / 32; const int n = tid / 32;
@ -192,7 +208,8 @@ __kernel void dequantize_block_q2_K(__global const struct block_q2_K* x, __globa
y[l + 96] = dall * (x[i].scales[is + 6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is + 6] >> 4); y[l + 96] = dall * (x[i].scales[is + 6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is + 6] >> 4);
} }
__kernel void dequantize_block_q3_K(__global const struct block_q3_K* x, __global float *yy) { __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy)
{
int r = get_local_id(0) / 4; int r = get_local_id(0) / 4;
int i = get_group_id(0); int i = get_group_id(0);
int tid = r / 2; int tid = r / 2;
@ -205,10 +222,9 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K* x, __globa
int is = 8 * n + 2 * j + is0; int is = 8 * n + 2 * j + is0;
int shift = 2 * j; int shift = 2 * j;
int8_t us = is < 4 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 8] >> 0) & 3) << 4) : int8_t us = is < 4 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 8] >> 0) & 3) << 4) : is < 8 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 4] >> 2) & 3) << 4)
is < 8 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 4] >> 2) & 3) << 4) : : is < 12 ? (x[i].scales[is - 8] >> 4) | (((x[i].scales[is + 0] >> 4) & 3) << 4)
is < 12 ? (x[i].scales[is - 8] >> 4) | (((x[i].scales[is + 0] >> 4) & 3) << 4) : : (x[i].scales[is - 8] >> 4) | (((x[i].scales[is - 4] >> 6) & 3) << 4);
(x[i].scales[is - 8] >> 4) | (((x[i].scales[is - 4] >> 6) & 3) << 4);
float d_all = vload_half(0, &x[i].d); float d_all = vload_half(0, &x[i].d);
float dl = d_all * (us - 32); float dl = d_all * (us - 32);
@ -218,11 +234,10 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K* x, __globa
for (int l = l0; l < l0 + 4; ++l) for (int l = l0; l < l0 + 4; ++l)
y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)); y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
} }
__kernel void dequantize_block_q4_K(__global const struct block_q4_K* x, __global float *yy) { __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy)
{
const int i = get_group_id(0); const int i = get_group_id(0);
const int tid = get_local_id(0); const int tid = get_local_id(0);
const int il = tid / 8; const int il = tid / 8;
@ -244,14 +259,47 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K* x, __globa
get_scale_min_k4(is + 1, x[i].scales, &sc, &m); get_scale_min_k4(is + 1, x[i].scales, &sc, &m);
float d2 = dall * sc; float d2 = dall * sc;
float m2 = dmin * m; float m2 = dmin * m;
for (int l = 0; l < n; ++l) { for (int l = 0; l < n; ++l)
{
y[l + 0] = d1 * (q[l] & 0xF) - m1; y[l + 0] = d1 * (q[l] & 0xF) - m1;
y[l + 32] = d2 * (q[l] >> 4) - m2; y[l + 32] = d2 * (q[l] >> 4) - m2;
} }
} }
__kernel void dequantize_block_q6_K(__global const struct block_q6_K* x, __global float *yy) { __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy)
{
const int i = get_group_id(0);
const int tid = get_local_id(0);
const int il = tid / 16;
const int ir = tid % 16;
const int is = 2 * il;
__global float *y = yy + i * 256 + 64 * il + 2 * ir;
const float dall = vload_half(0, &x[i].d);
const float dmin = vload_half(0, &x[i].dmin);
__global const uint8_t *ql = x[i].qs + 32 * il + 2 * ir;
__global const uint8_t *qh = x[i].qh + 2 * ir;
uint8_t sc, m;
get_scale_min_k4(is + 0, x[i].scales, &sc, &m);
const float d1 = dall * sc;
const float m1 = dmin * m;
get_scale_min_k4(is + 1, x[i].scales, &sc, &m);
const float d2 = dall * sc;
const float m2 = dmin * m;
uint8_t hm = 1 << (2 * il);
y[0] = d1 * ((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0)) - m1;
y[1] = d1 * ((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0)) - m1;
hm <<= 1;
y[32] = d2 * ((ql[0] >> 4) + (qh[0] & hm ? 16 : 0)) - m2;
y[33] = d2 * ((ql[1] >> 4) + (qh[1] & hm ? 16 : 0)) - m2;
}
__kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy)
{
const int i = get_group_id(0); const int i = get_group_id(0);
const int tid = get_local_id(0); const int tid = get_local_id(0);
const int ip = tid / 32; const int ip = tid / 32;
@ -272,32 +320,85 @@ __kernel void dequantize_block_q6_K(__global const struct block_q6_K* x, __globa
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
} }
void vec_dot_q2_K(__global const struct block_q2_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
int n = iqs / 128;
int r = iqs - 128 * n;
int l = r / 8;
__global const float *y = yy + 128 * n + l;
__global const uint8_t *q = x[ib].qs + 32 * n + l;
__global const uint8_t *s = x[ib].scales + 8 * n;
const float dall = vload_half(0, &x[ib].d);
const float dmin = vload_half(0, &x[ib].dmin);
float sum = y[ 0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4))
+ y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
+ y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
+ y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
+ y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
+ y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
+ y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
+ y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));
*result = sum;
}
void vec_dot_q3_K(__global const struct block_q3_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
const uint32_t kmask1 = 0x03030303;
const uint32_t kmask2 = 0x0f0f0f0f;
uint32_t aux[3];
uint32_t utmp[4];
int n = iqs/128;
int r = iqs - 128*n;
int l = r/8;
__global const float * y = yy + 128*n + l;
__global const uint8_t * q = x[ib].qs + 32*n + l;
__global const uint8_t * hm = x[ib].hmask + l;
const int8_t * s = (const int8_t *)utmp + 8*n;
aux[0] |= x[ib].scales[0];
aux[0] |= x[ib].scales[1] << 8;
aux[0] |= x[ib].scales[2] << 16;
aux[0] |= x[ib].scales[3] << 24;
aux[1] |= x[ib].scales[4];
aux[1] |= x[ib].scales[5] << 8;
aux[1] |= x[ib].scales[6] << 16;
aux[1] |= x[ib].scales[7] << 24;
aux[2] |= x[ib].scales[8];
aux[2] |= x[ib].scales[9] << 8;
aux[2] |= x[ib].scales[10] << 16;
aux[2] |= x[ib].scales[11] << 24;
utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
const float dall = vload_half(0, &x[ib].d);
const uint8_t m = 1 << (4*n);
float sum = y[ 0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4))
+ y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4))
+ y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4))
+ y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
+ y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
+ y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
+ y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
+ y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));
*result = sum * dall;
}
); );
// __kernel void vec_dot_q2_K(__global const struct block_q2_K* x, const int ib, const int iqs, const __global float *yy, __global float *result) {
// int n = iqs / 128; // 0 or 1
// int r = iqs - 128 * n; // 0...120 in steps of 8
// int l = r / 8; // 0...15 in steps of 1
// __global const float *y = yy + 128 * n + l;
// __global const uchar *q = x[ib].qs + 32 * n + l;
// __global const uchar *s = x[ib].scales + 8 * n;
// const float dall = vload_half(0, &x[ib].d);
// const float dmin = vload_half(0, &x[ib].dmin);
// float sum = y[0] * (dall * ((s[0] & 0xF) * ((q[0] >> 0) & 3)) - dmin * (s[0] >> 4))
// + y[32] * (dall * ((s[2] & 0xF) * ((q[0] >> 2) & 3)) - dmin * (s[2] >> 4))
// + y[64] * (dall * ((s[4] & 0xF) * ((q[0] >> 4) & 3)) - dmin * (s[4] >> 4))
// + y[96] * (dall * ((s[6] & 0xF) * ((q[0] >> 6) & 3)) - dmin * (s[6] >> 4))
// + y[16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
// + y[48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
// + y[80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
// + y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));
// *result = sum;
// }
std::string dequant_template = MULTILINE_QUOTE( std::string dequant_template = MULTILINE_QUOTE(
__kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) { __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
@ -365,44 +466,44 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
} }
); );
// std::string dequant_mul_mat_vec_k_template = MULTILINE_QUOTE( std::string dequant_mul_mat_vec_k_template = MULTILINE_QUOTE(
// __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
// const int block_size = get_local_size(0); const int block_size = get_local_size(0);
// const int row = get_global_id(0) / block_size; const int row = get_global_id(0) / block_size;
// const int tid = get_local_id(0); const int tid = get_local_id(0);
// const int iter_stride = 256; const int iter_stride = 256;
// const int vals_per_iter = iter_stride; const int vals_per_iter = iter_stride;
// const int num_blocks_per_row = ncols / 256; const int num_blocks_per_row = ncols / 256;
// const int ib0 = row*num_blocks_per_row; const int ib0 = row*num_blocks_per_row;
// tmp[tid] = 0; tmp[tid] = 0;
// for (int i = 0; i < ncols; i += iter_stride) { for (int i = 0; i < ncols; i += iter_stride) {
// const int col = i + vals_per_iter*tid; const int col = i + vals_per_iter*tid;
// const int ib = ib0 + col/QK_K; // x block index const int ib = ib0 + col/256; // x block index
// const int iqs = col%QK_K; // x quant index const int iqs = col%256; // x quant index
// const int iybs = col - col%QK_K; // y block start index const int iybs = col - col%256; // y block start index
// // dequantize // dequantize
// float v; float v;
// dot_kernel(vx, ib, iqs, y + iybs, v); dot_kernel(x, ib, iqs, y + iybs, &v);
// tmp += v; tmp[tid] += v;
// } }
// // sum up partial sums and write back result // sum up partial sums and write back result
// barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// for (int s=block_size/2; s>0; s>>=1) { for (int s=block_size/2; s>0; s>>=1) {
// if (tid < s) { if (tid < s) {
// tmp[tid] += tmp[tid + s]; tmp[tid] += tmp[tid + s];
// } }
// barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// } }
// if (tid == 0) { if (tid == 0) {
// dst[row] = tmp[0]; dst[row] = tmp[0];
// } }
// } }
// ); );
std::string mul_template = MULTILINE_QUOTE( std::string mul_template = MULTILINE_QUOTE(
__kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) { __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) {
@ -465,6 +566,11 @@ std::array<std::string, 2> mul_str_values = {
"mul_f32", "float" "mul_f32", "float"
}; };
std::array<std::string, 6> dmmv_k_str_values = {
"dequantize_mul_mat_vec_q2_K", "struct block_q2_K", "vec_dot_q2_K",
"dequantize_mul_mat_vec_q3_K", "struct block_q3_K", "vec_dot_q3_K",
};
std::string& replace(std::string& s, const std::string& from, const std::string& to) { std::string& replace(std::string& s, const std::string& from, const std::string& to) {
size_t pos = 0; size_t pos = 0;
while ((pos = s.find(from, pos)) != std::string::npos) { while ((pos = s.find(from, pos)) != std::string::npos) {
@ -494,6 +600,15 @@ std::string generate_kernels() {
} }
src << mul_kernel << '\n'; src << mul_kernel << '\n';
} }
for (size_t i = 0; i < dmmv_k_str_values.size(); i += 3) {
std::string dmmv_kernel = dequant_mul_mat_vec_k_template;
//just apply quick template fn name replacement for the K quant DMMVs since sizes are known
replace(dmmv_kernel, "KERNEL_NAME", dmmv_k_str_values[i]);
replace(dmmv_kernel, "X_TYPE", dmmv_k_str_values[i + 1]);
replace(dmmv_kernel, "dot_kernel", dmmv_k_str_values[i + 2]);
src << dmmv_kernel << '\n';
}
return src.str(); return src.str();
} }
@ -505,7 +620,8 @@ static cl_program program;
static cl_kernel convert_row_f16_cl; static cl_kernel convert_row_f16_cl;
static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl; static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl;
static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl; static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl;
static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q6_k_cl; static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl;
static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl;
static cl_kernel mul_f32_cl; static cl_kernel mul_f32_cl;
static bool fp16_support; static bool fp16_support;
@ -739,6 +855,7 @@ void ggml_cl_init(void) {
CL_CHECK((dequantize_block_q2_k_cl = clCreateKernel(program, "dequantize_block_q2_K", &err), err)); CL_CHECK((dequantize_block_q2_k_cl = clCreateKernel(program, "dequantize_block_q2_K", &err), err));
CL_CHECK((dequantize_block_q3_k_cl = clCreateKernel(program, "dequantize_block_q3_K", &err), err)); CL_CHECK((dequantize_block_q3_k_cl = clCreateKernel(program, "dequantize_block_q3_K", &err), err));
CL_CHECK((dequantize_block_q4_k_cl = clCreateKernel(program, "dequantize_block_q4_K", &err), err)); CL_CHECK((dequantize_block_q4_k_cl = clCreateKernel(program, "dequantize_block_q4_K", &err), err));
CL_CHECK((dequantize_block_q5_k_cl = clCreateKernel(program, "dequantize_block_q5_K", &err), err));
CL_CHECK((dequantize_block_q6_k_cl = clCreateKernel(program, "dequantize_block_q6_K", &err), err)); CL_CHECK((dequantize_block_q6_k_cl = clCreateKernel(program, "dequantize_block_q6_K", &err), err));
// dequant mul mat kernel // dequant mul mat kernel
@ -748,7 +865,8 @@ void ggml_cl_init(void) {
CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err)); CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err));
CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err)); CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err));
CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err)); CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err));
CL_CHECK((dequantize_mul_mat_vec_q2_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q2_K", &err), err));
CL_CHECK((dequantize_mul_mat_vec_q3_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q3_K", &err), err));
// mul kernel // mul kernel
CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err)); CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
@ -772,6 +890,8 @@ static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
return &dequantize_block_q3_k_cl; return &dequantize_block_q3_k_cl;
case GGML_TYPE_Q4_K: case GGML_TYPE_Q4_K:
return &dequantize_block_q4_k_cl; return &dequantize_block_q4_k_cl;
case GGML_TYPE_Q5_K:
return &dequantize_block_q5_k_cl;
case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K:
return &dequantize_block_q6_k_cl; return &dequantize_block_q6_k_cl;
case GGML_TYPE_F16: case GGML_TYPE_F16:
@ -853,6 +973,16 @@ static cl_kernel* ggml_get_dequantize_mul_mat_vec_cl(ggml_type type) {
return &dequantize_mul_mat_vec_q8_0_cl; return &dequantize_mul_mat_vec_q8_0_cl;
case GGML_TYPE_F16: case GGML_TYPE_F16:
return &convert_mul_mat_vec_f16_cl; return &convert_mul_mat_vec_f16_cl;
case GGML_TYPE_Q2_K:
return &dequantize_mul_mat_vec_q2_K_cl;
case GGML_TYPE_Q3_K:
return &dequantize_mul_mat_vec_q3_K_cl;
case GGML_TYPE_Q4_K:
return &dequantize_mul_mat_vec_q4_K_cl;
case GGML_TYPE_Q5_K:
return &dequantize_mul_mat_vec_q5_K_cl;
case GGML_TYPE_Q6_K:
return &dequantize_mul_mat_vec_q6_K_cl;
default: default:
return nullptr; return nullptr;
} }