Merge branch 'optimize_quants_upstream' into concedo_experimental
This commit is contained in:
commit
d754915269
1 changed files with 174 additions and 15 deletions
189
ggml-opencl.cpp
189
ggml-opencl.cpp
|
@ -22,11 +22,19 @@
|
||||||
|
|
||||||
#define CL_DMMV_BLOCK_SIZE 32
|
#define CL_DMMV_BLOCK_SIZE 32
|
||||||
|
|
||||||
|
#ifndef K_QUANTS_PER_ITERATION
|
||||||
|
#define K_QUANTS_PER_ITERATION 2
|
||||||
|
#else
|
||||||
|
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
|
||||||
|
#endif
|
||||||
|
|
||||||
#define MULTILINE_QUOTE(...) #__VA_ARGS__
|
#define MULTILINE_QUOTE(...) #__VA_ARGS__
|
||||||
static std::string program_source = MULTILINE_QUOTE(
|
static std::string program_source = MULTILINE_QUOTE(
|
||||||
|
|
||||||
typedef char int8_t;
|
typedef char int8_t;
|
||||||
typedef uchar uint8_t;
|
typedef uchar uint8_t;
|
||||||
|
typedef short int16_t;
|
||||||
|
typedef ushort uint16_t;
|
||||||
typedef int int32_t;
|
typedef int int32_t;
|
||||||
typedef uint uint32_t;
|
typedef uint uint32_t;
|
||||||
|
|
||||||
|
@ -200,7 +208,7 @@ __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __globa
|
||||||
const int is = 8 * n + l / 16;
|
const int is = 8 * n + l / 16;
|
||||||
|
|
||||||
const uint8_t q = x[i].qs[32 * n + l];
|
const uint8_t q = x[i].qs[32 * n + l];
|
||||||
__global float *y = yy + i * 256 + 128 * n;
|
__global float *y = yy + i * QK_K + 128 * n;
|
||||||
|
|
||||||
const float dall = vload_half(0, &x[i].d);
|
const float dall = vload_half(0, &x[i].d);
|
||||||
const float dmin = vload_half(0, &x[i].dmin);
|
const float dmin = vload_half(0, &x[i].dmin);
|
||||||
|
@ -232,7 +240,7 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
|
||||||
float d_all = vload_half(0, &x[i].d);
|
float d_all = vload_half(0, &x[i].d);
|
||||||
float dl = d_all * (us - 32);
|
float dl = d_all * (us - 32);
|
||||||
|
|
||||||
__global float *y = yy + i * 256 + 128 * n + 32 * j;
|
__global float *y = yy + i * QK_K + 128 * n + 32 * j;
|
||||||
const __global uint8_t *q = x[i].qs + 32 * n;
|
const __global uint8_t *q = x[i].qs + 32 * n;
|
||||||
const __global uint8_t *hm = x[i].hmask;
|
const __global uint8_t *hm = x[i].hmask;
|
||||||
|
|
||||||
|
@ -249,7 +257,7 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __globa
|
||||||
const int is = 2 * il;
|
const int is = 2 * il;
|
||||||
const int n = 4;
|
const int n = 4;
|
||||||
|
|
||||||
__global float *y = yy + i * 256 + 64 * il + n * ir;
|
__global float *y = yy + i * QK_K + 64 * il + n * ir;
|
||||||
|
|
||||||
const float dall = vload_half(0, &x[i].d);
|
const float dall = vload_half(0, &x[i].d);
|
||||||
const float dmin = vload_half(0, &x[i].dmin);
|
const float dmin = vload_half(0, &x[i].dmin);
|
||||||
|
@ -278,7 +286,7 @@ __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __globa
|
||||||
const int ir = tid % 16;
|
const int ir = tid % 16;
|
||||||
const int is = 2 * il;
|
const int is = 2 * il;
|
||||||
|
|
||||||
__global float *y = yy + i * 256 + 64 * il + 2 * ir;
|
__global float *y = yy + i * QK_K + 64 * il + 2 * ir;
|
||||||
|
|
||||||
const float dall = vload_half(0, &x[i].d);
|
const float dall = vload_half(0, &x[i].d);
|
||||||
const float dmin = vload_half(0, &x[i].dmin);
|
const float dmin = vload_half(0, &x[i].dmin);
|
||||||
|
@ -310,7 +318,7 @@ __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __globa
|
||||||
const int il = tid - 32 * ip;
|
const int il = tid - 32 * ip;
|
||||||
const int is = 8 * ip + il / 16;
|
const int is = 8 * ip + il / 16;
|
||||||
|
|
||||||
__global float *y = yy + i * 256 + 128 * ip + il;
|
__global float *y = yy + i * QK_K + 128 * ip + il;
|
||||||
|
|
||||||
const float d = vload_half(0, &x[i].d);
|
const float d = vload_half(0, &x[i].d);
|
||||||
|
|
||||||
|
@ -421,6 +429,80 @@ void vec_dot_q4_K(__global const struct block_q4_K* x, const int ib, const int i
|
||||||
*result = sum;
|
*result = sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__kernel void dequantize_mul_mat_vec_q4_K_fast(__global struct block_q4_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
|
||||||
|
|
||||||
|
//to rename it later, just to test now
|
||||||
|
const uint16_t kmask1 = 0x3f3f;
|
||||||
|
const uint16_t kmask2 = 0x0f0f;
|
||||||
|
const uint16_t kmask3 = 0xc0c0;
|
||||||
|
|
||||||
|
const int row = get_group_id(0);
|
||||||
|
const int num_blocks_per_row = ncols / QK_K;
|
||||||
|
const int ib0 = row*num_blocks_per_row;
|
||||||
|
|
||||||
|
const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15
|
||||||
|
const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION;
|
||||||
|
|
||||||
|
const int step = 8/K_QUANTS_PER_ITERATION;
|
||||||
|
|
||||||
|
const int il = tid/step; // 0...3
|
||||||
|
const int ir = tid - step*il;// 0...3
|
||||||
|
const int n = 2*K_QUANTS_PER_ITERATION;
|
||||||
|
|
||||||
|
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
||||||
|
const int in = il%2;
|
||||||
|
|
||||||
|
const int l0 = n*(2*ir + in);
|
||||||
|
const int q_offset = 32*im + l0;
|
||||||
|
const int y_offset = 64*im + l0;
|
||||||
|
|
||||||
|
uint16_t aux[4];
|
||||||
|
const uint8_t * sc = (const uint8_t *)aux;
|
||||||
|
|
||||||
|
const struct block_q4_K * x = xx + ib0;
|
||||||
|
|
||||||
|
tmp[16 * ix + tid] = 0;
|
||||||
|
|
||||||
|
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
||||||
|
|
||||||
|
const uint8_t * q1 = x[i].qs + q_offset;
|
||||||
|
const uint8_t * q2 = q1 + 64;
|
||||||
|
const float * y1 = yy + i*QK_K + y_offset;
|
||||||
|
const float * y2 = y1 + 128;
|
||||||
|
|
||||||
|
const float dall = vload_half(0, &x[i].d);
|
||||||
|
const float dmin = vload_half(0, &x[i].dmin);
|
||||||
|
|
||||||
|
const uint16_t * a = (const uint16_t *)x[i].scales;
|
||||||
|
aux[0] = a[im+0] & kmask1;
|
||||||
|
aux[1] = a[im+2] & kmask1;
|
||||||
|
aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
|
||||||
|
aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
|
||||||
|
|
||||||
|
float4 s = (float4)(0.f);
|
||||||
|
float smin = 0;
|
||||||
|
for (int l = 0; l < n; ++l) {
|
||||||
|
s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
|
||||||
|
s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
|
||||||
|
smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
|
||||||
|
}
|
||||||
|
tmp[16 * ix + tid] += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// sum up partial sums and write back result
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
for (int s=16; s>0; s>>=1) {
|
||||||
|
if (tid < s) {
|
||||||
|
tmp[tid] += tmp[tid + s];
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
}
|
||||||
|
if (tid == 0) {
|
||||||
|
dst[row] = tmp[0];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void vec_dot_q5_K(__global const struct block_q5_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
|
void vec_dot_q5_K(__global const struct block_q5_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
|
||||||
|
|
||||||
const int j = iqs / 64;
|
const int j = iqs / 64;
|
||||||
|
@ -481,6 +563,82 @@ void vec_dot_q6_K(__global const struct block_q6_K* x, const int ib, const int i
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__kernel void dequantize_mul_mat_vec_q6_K_fast(__global struct block_q6_K * xx, __local float* tmp, __global const float * yy, __global float * dst, const int ncols) {
|
||||||
|
|
||||||
|
const int row = get_group_id(0);
|
||||||
|
|
||||||
|
const int num_blocks_per_row = ncols / QK_K;
|
||||||
|
const int ib0 = row*num_blocks_per_row;
|
||||||
|
|
||||||
|
const struct block_q6_K * x = xx + ib0;
|
||||||
|
|
||||||
|
const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
||||||
|
const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0, 1
|
||||||
|
|
||||||
|
const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
|
||||||
|
|
||||||
|
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
||||||
|
const int in = tid - step*im; // 0...15 or 0...7
|
||||||
|
|
||||||
|
#if K_QUANTS_PER_ITERATION == 1
|
||||||
|
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
|
||||||
|
const int is = 0;
|
||||||
|
#else
|
||||||
|
const int l0 = 4 * in; // 0, 4, 8, ..., 28
|
||||||
|
const int is = in / 4;
|
||||||
|
#endif
|
||||||
|
const int ql_offset = 64*im + l0;
|
||||||
|
const int qh_offset = 32*im + l0;
|
||||||
|
const int s_offset = 8*im + is;
|
||||||
|
const int y_offset = 128*im + l0;
|
||||||
|
|
||||||
|
tmp[16 * ix + tid] = 0; // partial sum for thread in warp
|
||||||
|
|
||||||
|
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
||||||
|
|
||||||
|
const float * y = yy + i * QK_K + y_offset;
|
||||||
|
const uint8_t * ql = x[i].ql + ql_offset;
|
||||||
|
const uint8_t * qh = x[i].qh + qh_offset;
|
||||||
|
const int8_t * s = x[i].scales + s_offset;
|
||||||
|
|
||||||
|
const float d = vload_half(0, &x[i].d);
|
||||||
|
|
||||||
|
#if K_QUANTS_PER_ITERATION == 1
|
||||||
|
float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
|
||||||
|
+ y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
|
||||||
|
+ y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
|
||||||
|
+ y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
|
||||||
|
+ y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
|
||||||
|
+ y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
|
||||||
|
+ y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
|
||||||
|
+y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
|
||||||
|
tmp[16 * ix + tid] += sum;
|
||||||
|
#else
|
||||||
|
float sum = 0;
|
||||||
|
for (int l = 0; l < 4; ++l) {
|
||||||
|
sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
|
||||||
|
+ y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
|
||||||
|
+ y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
|
||||||
|
+ y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
|
||||||
|
}
|
||||||
|
tmp[16 * ix + tid] += sum;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// sum up partial sums and write back result
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
for (int s=16; s>0; s>>=1) {
|
||||||
|
if (tid < s) {
|
||||||
|
tmp[tid] += tmp[tid + s];
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
}
|
||||||
|
if (tid == 0) {
|
||||||
|
dst[row] = tmp[0];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
|
@ -556,18 +714,18 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
|
||||||
const int row = get_group_id(0);
|
const int row = get_group_id(0);
|
||||||
const int tid = get_local_id(0);
|
const int tid = get_local_id(0);
|
||||||
|
|
||||||
const int iter_stride = 256;
|
const int iter_stride = QK_K;
|
||||||
const int vals_per_iter = iter_stride / block_size;
|
const int vals_per_iter = iter_stride / block_size;
|
||||||
const int num_blocks_per_row = ncols / 256;
|
const int num_blocks_per_row = ncols / QK_K;
|
||||||
const int ib0 = row*num_blocks_per_row;
|
const int ib0 = row*num_blocks_per_row;
|
||||||
|
|
||||||
tmp[tid] = 0;
|
tmp[tid] = 0;
|
||||||
|
|
||||||
for (int i = 0; i < ncols; i += iter_stride) {
|
for (int i = 0; i < ncols; i += iter_stride) {
|
||||||
const int col = i + vals_per_iter*tid;
|
const int col = i + vals_per_iter*tid;
|
||||||
const int ib = ib0 + col/256; // x block index
|
const int ib = ib0 + col/QK_K; // x block index
|
||||||
const int iqs = col%256; // x quant index
|
const int iqs = col%QK_K; // x quant index
|
||||||
const int iybs = col - col%256; // y block start index
|
const int iybs = col - col%QK_K; // y block start index
|
||||||
|
|
||||||
// dequantize
|
// dequantize
|
||||||
float v;
|
float v;
|
||||||
|
@ -732,10 +890,11 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
const char* compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math "
|
std::string compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math "
|
||||||
"-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1";
|
"-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1 "
|
||||||
|
"-DQK_K=256 -DK_QUANTS_PER_ITERATION=" + std::to_string(K_QUANTS_PER_ITERATION);
|
||||||
|
|
||||||
err = clBuildProgram(p, 0, NULL, compile_opts, NULL, NULL);
|
err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL);
|
||||||
if(err < 0) {
|
if(err < 0) {
|
||||||
|
|
||||||
clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
|
clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
|
||||||
|
@ -964,9 +1123,9 @@ void ggml_cl_init(void) {
|
||||||
CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err));
|
CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err));
|
||||||
CL_CHECK((dequantize_mul_mat_vec_q2_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q2_K", &err), err));
|
CL_CHECK((dequantize_mul_mat_vec_q2_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q2_K", &err), err));
|
||||||
CL_CHECK((dequantize_mul_mat_vec_q3_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q3_K", &err), err));
|
CL_CHECK((dequantize_mul_mat_vec_q3_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q3_K", &err), err));
|
||||||
CL_CHECK((dequantize_mul_mat_vec_q4_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_K", &err), err));
|
CL_CHECK((dequantize_mul_mat_vec_q4_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_K_fast", &err), err));
|
||||||
CL_CHECK((dequantize_mul_mat_vec_q5_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_K", &err), err));
|
CL_CHECK((dequantize_mul_mat_vec_q5_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_K", &err), err));
|
||||||
CL_CHECK((dequantize_mul_mat_vec_q6_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q6_K", &err), err));
|
CL_CHECK((dequantize_mul_mat_vec_q6_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q6_K_fast", &err), err));
|
||||||
|
|
||||||
// mul kernel
|
// mul kernel
|
||||||
CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
|
CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue