Merge branch 'opencl-dev2' into concedo_experimental

# Conflicts: # CMakeLists.txt
2023-05-16 17:04:33 +08:00 · 2023-05-16 17:04:33 +08:00 · 196fbba527
commit 196fbba527
parent 554340f565 342d346c13
4 changed files with 212 additions and 245 deletions
--- a/1
+++ b/1
@ -248,7 +248,6 @@ expose.o: expose.cpp expose.h
 gpttype_adapter.o: gpttype_adapter.cpp
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 gpttype_adapter_clblast.o: gpttype_adapter.cpp
 	$(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
--- a/ggml-opencl-legacy.h
+++ b/ggml-opencl-legacy.h
@ -6,17 +6,6 @@
 extern "C" {
 #endif
 enum ggml_blas_order {
    GGML_BLAS_ORDER_ROW_MAJOR = 101,
    GGML_BLAS_ORDER_COLUMN_MAJOR = 102,
 };
 enum ggml_blas_op {
    GGML_BLAS_OP_N = 111,
    GGML_BLAS_OP_T = 112,
    GGML_BLAS_OP_C = 113,
 };
 void ggml_cl_init_legacy(void);
 void ggml_cl_sgemm_wrapper_legacy(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype);
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@ -1,6 +1,8 @@
 #include "ggml-opencl.h"
 #include <array>
 #include <atomic>
 #include <sstream>
 #define CL_TARGET_OPENCL_VERSION 110
 #include <clblast.h>
@ -15,30 +17,30 @@
 #define CL_DMMV_BLOCK_SIZE 32;
 #define MULTILINE_QUOTE(...) #__VA_ARGS__
-const char * clblast_dequant = MULTILINE_QUOTE(
+std::string program_source = MULTILINE_QUOTE(
 typedef char int8_t;
 typedef uchar uint8_t;
 typedef int int32_t;
 typedef uint uint32_t;
-constant uint GGML_TYPE_Q4_0 = 2;
+const uint GGML_TYPE_Q4_0 = 2;
-constant uint GGML_TYPE_Q4_1 = 3;
+const uint GGML_TYPE_Q4_1 = 3;
-constant uint GGML_TYPE_Q5_0 = 6;
+const uint GGML_TYPE_Q5_0 = 6;
-constant uint GGML_TYPE_Q5_1 = 7;
+const uint GGML_TYPE_Q5_1 = 7;
-constant uint GGML_TYPE_Q8_0 = 8;
+const uint GGML_TYPE_Q8_0 = 8;
-constant uint GGML_TYPE_Q8_1 = 9;
+const uint GGML_TYPE_Q8_1 = 9;
-constant uint QK4_0 = 32;
+const uint  QK4_0 = 32;
-constant uint QR4_0 = 2;
+const uint  QR4_0 = 2;
 struct block_q4_0
 {
    float d;
    uint8_t qs[QK4_0 / 2];
 };
-constant uint QK4_1 = 32;
+const uint  QK4_1 = 32;
-constant uint QR4_1 = 2;
+const uint  QR4_1 = 2;
 struct block_q4_1
 {
    float d;
@ -46,8 +48,8 @@ struct block_q4_1
    uint8_t qs[QK4_1 / 2];
 };
-constant uint QK5_0 = 32;
+const uint  QK5_0 = 32;
-constant uint QR5_0 = 2;
+const uint  QR5_0 = 2;
 struct __attribute__ ((packed)) block_q5_0
 {
    half d;
@ -55,8 +57,8 @@ struct __attribute__ ((packed)) block_q5_0
    uint8_t qs[QK5_0 / 2];
 };
-constant uint QK5_1 = 32;
+const uint  QK5_1 = 32;
-constant uint QR5_1 = 2;
+const uint  QR5_1 = 2;
 struct block_q5_1
 {
    half d;
@ -65,8 +67,8 @@ struct block_q5_1
    uint8_t qs[QK5_1 / 2];
 };
-constant uint QK8_0 = 32;
+const uint  QK8_0 = 32;
-constant uint QR8_0 = 1;
+const uint  QR8_0 = 1;
 struct block_q8_0
 {
    float d;
@ -82,7 +84,7 @@ __kernel void convert_fp16_to_fp32(__global half* x, __global float* y) {
 __kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
-    constant uint qk = QK4_0;
+    const uint  qk = QK4_0;
    const uint i = get_global_id(0) / qk;
    const uint j = get_local_id(0);
@ -97,7 +99,7 @@ __kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float*
 }
 __kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
-    constant uint qk = QK4_1;
+    const uint  qk = QK4_1;
    const uint i = get_global_id(0) / qk;
    const uint j = get_local_id(0);
@ -113,7 +115,7 @@ __kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float*
 }
 __kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
-    constant uint qk = QK5_0;
+    const uint  qk = QK5_0;
    const uint i = get_global_id(0) / qk;
    const uint j = get_local_id(0);
@ -133,7 +135,7 @@ __kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float*
 }
 __kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
-    constant uint qk = QK5_1;
+    const uint  qk = QK5_1;
    const uint i = get_global_id(0) / qk;
    const uint j = get_local_id(0);
@ -154,7 +156,7 @@ __kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float*
 }
 __kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
-    constant uint qk = QK8_0;
+    const uint  qk = QK8_0;
    const uint i = get_global_id(0) / qk;
    const uint j = get_local_id(0);
@ -173,47 +175,6 @@ void dequantize_q4_0(__global const struct block_q4_0* x, const int ib, const in
    *v0 = (vi0 - 8)*d;
    *v1 = (vi1 - 8)*d;
 }
 __kernel void dequantize_mul_mat_vec_q4_0(__global struct block_q4_0* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
    const int block_size = get_local_size(0);
    const int row = get_global_id(0) / block_size;
    const int tid = get_local_id(0);
    const uint qk = QK4_0;
    const uint qr = QR4_0;
    const int y_offset = qr == 1 ? 1 : qk/2;
    tmp[tid] = 0;
    for (int i = 0; i < ncols/block_size; i += 2) {
        const int col = i*block_size + 2*tid;
        const int ib = (row*ncols + col)/qk; // block index
        const int iqs = (col%qk)/qr; // quant index
        const int iybs = col - col%qk; // y block start index
        // dequantize
        float v0, v1;
        dequantize_q4_0(x, ib, iqs, &v0, &v1);
        // matrix multiplication
        tmp[tid] += v0 * y[iybs + iqs + 0];
        tmp[tid] += v1 * y[iybs + iqs + y_offset];
    }
    // sum up partial sums and write back result
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int s=block_size/2; s>0; s>>=1) {
        if (tid < s) {
            tmp[tid] += tmp[tid + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (tid == 0) {
        dst[row] = tmp[0];
    }
 }
 void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const int iqs, float* v0, float* v1) {
    const float d = x[ib].d;
    const float m = x[ib].m;
@ -226,46 +187,6 @@ void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const in
    *v0 = vi0*d + m;
    *v1 = vi1*d + m;
 }
 __kernel void dequantize_mul_mat_vec_q4_1(__global struct block_q4_1* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
    const int block_size = get_local_size(0);
    const int row = get_global_id(0) / block_size;
    const int tid = get_local_id(0);
    const uint qk = QK4_1;
    const uint qr = QR4_1;
    const int y_offset = qr == 1 ? 1 : qk/2;
    tmp[tid] = 0;
    for (int i = 0; i < ncols/block_size; i += 2) {
        const int col = i*block_size + 2*tid;
        const int ib = (row*ncols + col)/qk; // block index
        const int iqs = (col%qk)/qr; // quant index
        const int iybs = col - col%qk; // y block start index
        // dequantize
        float v0, v1;
        dequantize_q4_1(x, ib, iqs, &v0, &v1);
        // matrix multiplication
        tmp[tid] += v0 * y[iybs + iqs + 0];
        tmp[tid] += v1 * y[iybs + iqs + y_offset];
    }
    // sum up partial sums and write back result
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int s=block_size/2; s>0; s>>=1) {
        if (tid < s) {
            tmp[tid] += tmp[tid + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (tid == 0) {
        dst[row] = tmp[0];
    }
 }
 void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const int iqs, float* v0, float* v1) {
    const float d = vload_half(0, (__global half*) &x[ib].d);
@ -280,46 +201,6 @@ void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const in
    *v0 = x0*d;
    *v1 = x1*d;
 }
 __kernel void dequantize_mul_mat_vec_q5_0(__global struct block_q5_0* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
    const int block_size = get_local_size(0);
    const int row = get_global_id(0) / block_size;
    const int tid = get_local_id(0);
    const uint qk = QK5_0;
    const uint qr = QR5_0;
    const int y_offset = qr == 1 ? 1 : qk/2;
    tmp[tid] = 0;
    for (int i = 0; i < ncols/block_size; i += 2) {
        const int col = i*block_size + 2*tid;
        const int ib = (row*ncols + col)/qk; // block index
        const int iqs = (col%qk)/qr; // quant index
        const int iybs = col - col%qk; // y block start index
        // dequantize
        float v0, v1;
        dequantize_q5_0(x, ib, iqs, &v0, &v1);
        // matrix multiplication
        tmp[tid] += v0 * y[iybs + iqs + 0];
        tmp[tid] += v1 * y[iybs + iqs + y_offset];
    }
    // sum up partial sums and write back result
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int s=block_size/2; s>0; s>>=1) {
        if (tid < s) {
            tmp[tid] += tmp[tid + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (tid == 0) {
        dst[row] = tmp[0];
    }
 }
 void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const int iqs, float* v0, float* v1) {
    const float d = vload_half(0, (__global half*) &x[ib].d);
    const float m = vload_half(0, (__global half*) &x[ib].m);
@ -335,46 +216,6 @@ void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const in
    *v0 = x0*d + m;
    *v1 = x1*d + m;
 }
 __kernel void dequantize_mul_mat_vec_q5_1(__global struct block_q5_1* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
    const int block_size = get_local_size(0);
    const int row = get_global_id(0) / block_size;
    const int tid = get_local_id(0);
    const uint qk = QK5_1;
    const uint qr = QR5_1;
    const int y_offset = qr == 1 ? 1 : qk/2;
    tmp[tid] = 0;
    for (int i = 0; i < ncols/block_size; i += 2) {
        const int col = i*block_size + 2*tid;
        const int ib = (row*ncols + col)/qk; // block index
        const int iqs = (col%qk)/qr; // quant index
        const int iybs = col - col%qk; // y block start index
        // dequantize
        float v0, v1;
        dequantize_q5_1(x, ib, iqs, &v0, &v1);
        // matrix multiplication
        tmp[tid] += v0 * y[iybs + iqs + 0];
        tmp[tid] += v1 * y[iybs + iqs + y_offset];
    }
    // sum up partial sums and write back result
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int s=block_size/2; s>0; s>>=1) {
        if (tid < s) {
            tmp[tid] += tmp[tid + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (tid == 0) {
        dst[row] = tmp[0];
    }
 }
 void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const int iqs, float* v0, float* v1) {
    const float d = x[ib].d;
@ -384,13 +225,20 @@ void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const in
    *v0 = vi0*d;
    *v1 = vi1*d;
 }
-__kernel void dequantize_mul_mat_vec_q8_0(__global struct block_q8_0* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
+void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){
    *v0 = vload_half(0, (__global half*) &x[ib + 0]);
    *v1 = vload_half(0, (__global half*) &x[ib + 1]);
 }
 );
 std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
 __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
    const int block_size = get_local_size(0);
    const int row = get_global_id(0) / block_size;
    const int tid = get_local_id(0);
-    const uint qk = QK8_0;
+    const uint qk = QUANT_K;
-    const uint qr = QR8_0;
+    const uint qr = QUANT_R;
    const int y_offset = qr == 1 ? 1 : qk/2;
@ -404,51 +252,7 @@ __kernel void dequantize_mul_mat_vec_q8_0(__global struct block_q8_0* x, __local
        // dequantize
        float v0, v1;
-        dequantize_q8_0(x, ib, iqs, &v0, &v1);
+        DEQUANT_FUNC(x, ib, iqs, &v0, &v1);
        // matrix multiplication
        tmp[tid] += v0 * y[iybs + iqs + 0];
        tmp[tid] += v1 * y[iybs + iqs + y_offset];
    }
    // sum up partial sums and write back result
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int s=block_size/2; s>0; s>>=1) {
        if (tid < s) {
            tmp[tid] += tmp[tid + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (tid == 0) {
        dst[row] = tmp[0];
    }
 }
 void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){
    *v0 = vload_half(0, (__global half*) &x[ib + 0]);
    *v1 = vload_half(0, (__global half*) &x[ib + 1]);
 }
 __kernel void convert_mul_mat_vec_f16(__global half* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
    const int block_size = get_local_size(0);
    const int row = get_global_id(0) / block_size;
    const int tid = get_local_id(0);
    const uint qk = 32;
    const uint qr = 1;
    const int y_offset = qr == 1 ? 1 : qk/2;
    tmp[tid] = 0;
    for (int i = 0; i < ncols/block_size; i += 2) {
        const int col = i*block_size + 2*tid;
        const int ib = (row*ncols + col)/qk; // block index
        const int iqs = (col%qk)/qr; // quant index
        const int iybs = col - col%qk; // y block start index
        // convert
        float v0, v1;
        convert_f16(x, ib, iqs, &v0, &v1);
        // matrix multiplication
        tmp[tid] += v0 * y[iybs + iqs + 0];
@ -469,6 +273,41 @@ __kernel void convert_mul_mat_vec_f16(__global half* x, __local float* tmp, __gl
 }
 );
 std::array<std::string, 5> dequant_mul_mat_vec_str_keys = {
    "KERNEL_NAME", "X_TYPE", "QUANT_K", "QUANT_R", "DEQUANT_FUNC"
 };
 std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
    "dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0",
    "dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1",
    "dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0",
    "dequantize_mul_mat_vec_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1",
    "dequantize_mul_mat_vec_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0",
    "convert_mul_mat_vec_f16", "half", "32", "1", "convert_f16"
 };
 static std::string& sreplace(std::string& s, const std::string& from, const std::string& to) {
    size_t pos = 0;
    while ((pos = s.find(from, pos)) != std::string::npos) {
         s.replace(pos, from.length(), to);
         pos += to.length();
    }
    return s;
 }
 static std::string generate_kernels() {
    std::stringstream src;
    src << program_source << '\n';
    for (size_t i = 0; i < dequant_mul_mat_vec_str_values.size(); i += dequant_mul_mat_vec_str_keys.size()) {
        std::string kernel = dequant_mul_mat_vec_template;
        for (size_t j = 0; j < dequant_mul_mat_vec_str_keys.size(); j++) {
            sreplace(kernel, dequant_mul_mat_vec_str_keys[j], dequant_mul_mat_vec_str_values[i + j]);
        }
        src << kernel << '\n';
    }
    return src.str();
 }
 #define CL_CHECK(err, name)                                                                     \
    do {                                                                                        \
        cl_int err_ = (err);                                                                    \
@ -483,6 +322,8 @@ static cl_device_id device;
 static cl_context context;
 static cl_command_queue queue;
 static cl_program program;
 static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
 static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
 static cl_kernel convert_fp16_to_fp32_cl;
 static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl;
 static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl;
@ -560,7 +401,9 @@ void ggml_cl_init(void) {
    free(platforms);
    free(devices);
-    program = build_program_from_source(context, device, clblast_dequant);
+    std::string kernel_src = generate_kernels();
    program = build_program_from_source(context, device, kernel_src.c_str());
    // FP16 to FP32 kernel
    convert_fp16_to_fp32_cl = clCreateKernel(program, "convert_fp16_to_fp32", &err);
@ -593,6 +436,21 @@ void ggml_cl_init(void) {
    CL_CHECK(err, "clCreateKernel");
 }
 static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
    if (req_size <= *cur_size) {
        return;
    }
    // Reallocate buffer with enough space
    if (*cur_size > 0) {
        clReleaseMemObject(*buf);
    }
    cl_int err;
    *buf = clCreateBuffer(context, flags, req_size, NULL, &err);
    *cur_size = req_size;
    CL_CHECK(err, "clCreateBuffer");
 }
 static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
    switch (type) {
        case GGML_TYPE_Q4_0:
@ -1014,7 +872,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
    if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
        src1->type == GGML_TYPE_F32 &&
        dst->type == GGML_TYPE_F32 &&
-        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CL)) {
+        (src0->backend == GGML_BACKEND_CL)) {
        return true;
    }
@ -1090,3 +948,111 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
    tensor->data = d_Q;
    tensor->backend = GGML_BACKEND_CL;
 }
 void ggml_cl_sgemm_wrapper(
        const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b,
        const int m, const int n, const int k,
        const float alpha, const void *host_a, const int lda,
        const float *host_b, const int ldb, const float beta,
        float *host_c, const int ldc, const int btype) {
    cl_int err = 0;
    cl_kernel * kernel = ggml_get_to_fp32_cl((ggml_type)btype);
    size_t global = n * k, local, size_qb;
    bool dequant;
    switch (btype) {
    case GGML_TYPE_F32:
        dequant = false;
        break;
    case GGML_TYPE_Q4_0:
        dequant = true;
        local = 16;
        size_qb = global * (sizeof(float) + local) / 32;
        break;
    case GGML_TYPE_Q4_1:
        dequant = true;
        local = 16;
        size_qb = global * (sizeof(float) * 2 + local) / 32;
        break;
    case GGML_TYPE_Q5_0:
        dequant = true;
        local = 16;
        size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32;
        break;
    case GGML_TYPE_Q5_1:
        dequant = true;
        local = 16;
        size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32;
        break;
    case GGML_TYPE_Q8_0:
        dequant = true;
        local = 32;
        size_qb = global * (sizeof(float) + local) / 32;
        break;
    default:
        fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
        abort();
    }
    const size_t size_a =  m * k * sizeof(float);
    const size_t size_b =  n * k * sizeof(float);
    const size_t size_c =  m * n * sizeof(float);
    // Prepare buffers
    ggml_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a);
    if (dequant) {
        ggml_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb);
    }
    ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b);
    ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c);
    cl_event ev_a, ev_qb, ev_b;
    if (dequant) {
        err = clSetKernelArg(*kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
        err |= clSetKernelArg(*kernel, 1, sizeof(cl_mem), &cl_buffer_b);
        CL_CHECK(err, "clSetKernelArg");
        err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
        CL_CHECK(err, "clEnqueueWriteBuffer qb");
    } else {
        err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
        CL_CHECK(err, "clEnqueueWriteBuffer b");
    }
    err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
    CL_CHECK(err, "clEnqueueWriteBuffer a");
    if (dequant) {
        err = clEnqueueNDRangeKernel(queue, *kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
        CL_CHECK(err, "clEnqueueNDRangeKernel");
        clReleaseEvent(ev_qb);
    }
    clWaitForEvents(1, &ev_a);
    clWaitForEvents(1, &ev_b);
    clReleaseEvent(ev_a);
    clReleaseEvent(ev_b);
    cl_event ev_sgemm;
    CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order,
                                            (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
                                            m, n, k,
                                            alpha,
                                            cl_buffer_a, 0, lda,
                                            cl_buffer_b, 0, ldb,
                                            beta,
                                            cl_buffer_c, 0, ldc,
                                            &queue, &ev_sgemm);
    if (status != CLBlastSuccess) {
        fprintf(stderr, "Error: CLBlast SGEMM %d\n", status);
        abort();
    }
    cl_event ev_c;
    clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);
    // Wait for completion
    clWaitForEvents(1, &ev_c);
    clReleaseEvent(ev_sgemm);
    clReleaseEvent(ev_c);
 }
--- a/ggml-opencl.h
+++ b/ggml-opencl.h
@ -6,6 +6,17 @@
 extern "C" {
 #endif
 enum ggml_blas_order {
    GGML_BLAS_ORDER_ROW_MAJOR = 101,
    GGML_BLAS_ORDER_COLUMN_MAJOR = 102,
 };
 enum ggml_blas_op {
    GGML_BLAS_OP_N = 111,
    GGML_BLAS_OP_T = 112,
    GGML_BLAS_OP_C = 113,
 };
 void ggml_cl_init(void);
 bool   ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
@ -17,6 +28,8 @@ void   ggml_cl_host_free(void * ptr);
 void ggml_cl_transform_tensor(struct ggml_tensor * tensor);
 void ggml_cl_sgemm_wrapper(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype);
 #ifdef  __cplusplus
 }
 #endif