diff --git a/Makefile b/Makefile
index f952ff126..8f57e406d 100644
--- a/Makefile
+++ b/Makefile
@@ -2,6 +2,7 @@ default: koboldcpp koboldcpp_noavx2 koboldcpp_openblas koboldcpp_openblas_noavx2
 simple: koboldcpp koboldcpp_noavx2
 tools: quantize_gpt2 quantize_gptj quantize_llama quantize_neox
 dev: koboldcpp_openblas
+dev2: koboldcpp_clblast
 
 ifndef UNAME_S
@@ -225,8 +226,8 @@ ggml_openblas_noavx2.o: ggml.c ggml.h
 	$(CC) $(CFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
 ggml_clblast.o: ggml.c ggml.h
 	$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) $(CLBLAST_FLAGS) -c $< -o $@
-ggml-opencl.o: ggml-opencl.c ggml-opencl.h
-	$(CC) $(CFLAGS) -c $< -o $@
+ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
+	$(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
 ggml-opencl-legacy.o: ggml-opencl-legacy.c ggml-opencl-legacy.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
@@ -248,6 +249,9 @@ expose.o: expose.cpp expose.h
 gpttype_adapter.o: gpttype_adapter.cpp
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
+gpttype_adapter_clblast.o: gpttype_adapter.cpp
+	$(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
+
 clean:
 	rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize-stats perplexity embedding benchmark-matmult save-load-state build-info.h main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_noavx2.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_noavx2.so koboldcpp_openblas_noavx2.so koboldcpp_clblast.so gptj.exe gpt2.exe
 
@@ -273,7 +277,7 @@ koboldcpp_noavx2: ggml_noavx2.o ggml_v1_noavx2.o expose.o common.o gpttype_adapt
 koboldcpp_openblas_noavx2: ggml_openblas_noavx2.o ggml_v1_noavx2.o expose.o common.o gpttype_adapter.o
 	$(OPENBLAS_NOAVX2_BUILD)
 
-koboldcpp_clblast: ggml_clblast.o ggml_v1.o expose.o common.o gpttype_adapter.o ggml-opencl.o ggml-opencl-legacy.o
+koboldcpp_clblast: ggml_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml-opencl-legacy.o
 	$(CLBLAST_BUILD)
 
 quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o
diff --git a/ggml-opencl-legacy.h b/ggml-opencl-legacy.h
index 588a5bab6..ff7f4f0c2 100644
--- a/ggml-opencl-legacy.h
+++ b/ggml-opencl-legacy.h
@@ -6,6 +6,17 @@ extern "C" {
 #endif
 
+enum ggml_blas_order {
+    GGML_BLAS_ORDER_ROW_MAJOR = 101,
+    GGML_BLAS_ORDER_COLUMN_MAJOR = 102,
+};
+
+enum ggml_blas_op {
+    GGML_BLAS_OP_N = 111,
+    GGML_BLAS_OP_T = 112,
+    GGML_BLAS_OP_C = 113,
+};
+
 void ggml_cl_init_legacy(void);
 void ggml_cl_sgemm_wrapper_legacy(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype);
diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
new file mode 100644
index 000000000..1367408b8
--- /dev/null
+++ b/ggml-opencl.cpp
@@ -0,0 +1,1092 @@
+#include "ggml-opencl.h"
+
+#include <atomic>
+
+#define CL_TARGET_OPENCL_VERSION 110
+#include <clblast.h>
+#include <clblast_c.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ggml.h"
+
+#define CL_DMMV_BLOCK_SIZE 32
+
+#define MULTILINE_QUOTE(...)
#__VA_ARGS__ +const char * clblast_dequant = MULTILINE_QUOTE( + +typedef char int8_t; +typedef uchar uint8_t; +typedef int int32_t; +typedef uint uint32_t; + +constant uint GGML_TYPE_Q4_0 = 2; +constant uint GGML_TYPE_Q4_1 = 3; +constant uint GGML_TYPE_Q5_0 = 6; +constant uint GGML_TYPE_Q5_1 = 7; +constant uint GGML_TYPE_Q8_0 = 8; +constant uint GGML_TYPE_Q8_1 = 9; + +constant uint QK4_0 = 32; +constant uint QR4_0 = 2; +struct block_q4_0 +{ + float d; + uint8_t qs[QK4_0 / 2]; +}; + +constant uint QK4_1 = 32; +constant uint QR4_1 = 2; +struct block_q4_1 +{ + float d; + float m; + uint8_t qs[QK4_1 / 2]; +}; + +constant uint QK5_0 = 32; +constant uint QR5_0 = 2; +struct __attribute__ ((packed)) block_q5_0 +{ + half d; + uint32_t qh; + uint8_t qs[QK5_0 / 2]; +}; + +constant uint QK5_1 = 32; +constant uint QR5_1 = 2; +struct block_q5_1 +{ + half d; + half m; + uint32_t qh; + uint8_t qs[QK5_1 / 2]; +}; + +constant uint QK8_0 = 32; +constant uint QR8_0 = 1; +struct block_q8_0 +{ + float d; + uint8_t qs[QK8_0]; +}; + + +__kernel void convert_fp16_to_fp32(__global half* x, __global float* y) { + const uint i = get_global_id(0); + + y[i] = vload_half(0, &x[i]); +} + + +__kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) { + constant uint qk = QK4_0; + + const uint i = get_global_id(0) / qk; + const uint j = get_local_id(0); + + const float d = x[i].d; + + const int x0 = (x[i].qs[j] & 0xf) - 8; + const int x1 = (x[i].qs[j] >> 4) - 8; + + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; +} + +__kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) { + constant uint qk = QK4_1; + + const uint i = get_global_id(0) / qk; + const uint j = get_local_id(0); + + const float d = x[i].d; + const float m = x[i].m; + + const int x0 = (x[i].qs[j] & 0xf); + const int x1 = (x[i].qs[j] >> 4); + + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; +} + +__kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) { + constant uint qk = QK5_0; + + const uint i = get_global_id(0) / qk; + const uint j = get_local_id(0); + + const float d = vload_half(0, (__global half*) &x[i].d); + + uint32_t qh = x[i].qh; + + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; + + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; +} + +__kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) { + constant uint qk = QK5_1; + + const uint i = get_global_id(0) / qk; + const uint j = get_local_id(0); + + const float d = vload_half(0, (__global half*) &x[i].d); + const float m = vload_half(0, (__global half*) &x[i].m); + + uint32_t qh = x[i].qh; + + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int x0 = (x[i].qs[j] & 0xf) | xh_0; + const int x1 = (x[i].qs[j] >> 4) | xh_1; + + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; +} + +__kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) { + constant uint qk = QK8_0; + const uint i = get_global_id(0) / qk; + const uint j = get_local_id(0); + + const float d = x[i].d; + y[i*qk + j] = x[i].qs[j]*d; +} + +void dequantize_q4_0(__global const struct block_q4_0* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = x[ib].d; + + const uint8_t vui = x[ib].qs[iqs]; + + const int8_t vi0 = vui 
& 0xF; + const int8_t vi1 = vui >> 4; + + *v0 = (vi0 - 8)*d; + *v1 = (vi1 - 8)*d; +} + +__kernel void dequantize_mul_mat_vec_q4_0(__global struct block_q4_0* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { + const int block_size = get_local_size(0); + const int row = get_global_id(0) / block_size; + const int tid = get_local_id(0); + + const uint qk = QK4_0; + const uint qr = QR4_0; + + const int y_offset = qr == 1 ? 1 : qk/2; + + tmp[tid] = 0; + + for (int i = 0; i < ncols/block_size; i += 2) { + const int col = i*block_size + 2*tid; + const int ib = (row*ncols + col)/qk; // block index + const int iqs = (col%qk)/qr; // quant index + const int iybs = col - col%qk; // y block start index + + // dequantize + float v0, v1; + dequantize_q4_0(x, ib, iqs, &v0, &v1); + + // matrix multiplication + tmp[tid] += v0 * y[iybs + iqs + 0]; + tmp[tid] += v1 * y[iybs + iqs + y_offset]; + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=block_size/2; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = x[ib].d; + const float m = x[ib].m; + + const uint8_t vui = x[ib].qs[iqs]; + + const int8_t vi0 = vui & 0xF; + const int8_t vi1 = vui >> 4; + + *v0 = vi0*d + m; + *v1 = vi1*d + m; +} +__kernel void dequantize_mul_mat_vec_q4_1(__global struct block_q4_1* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { + const int block_size = get_local_size(0); + const int row = get_global_id(0) / block_size; + const int tid = get_local_id(0); + + const uint qk = QK4_1; + const uint qr = QR4_1; + + const int y_offset = qr == 1 ? 1 : qk/2; + + tmp[tid] = 0; + + for (int i = 0; i < ncols/block_size; i += 2) { + const int col = i*block_size + 2*tid; + const int ib = (row*ncols + col)/qk; // block index + const int iqs = (col%qk)/qr; // quant index + const int iybs = col - col%qk; // y block start index + + // dequantize + float v0, v1; + dequantize_q4_1(x, ib, iqs, &v0, &v1); + + // matrix multiplication + tmp[tid] += v0 * y[iybs + iqs + 0]; + tmp[tid] += v1 * y[iybs + iqs + y_offset]; + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=block_size/2; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = vload_half(0, (__global half*) &x[ib].d); + + uint32_t qh = x[ib].qh; + + const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16; + const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16; + + *v0 = x0*d; + *v1 = x1*d; +} +__kernel void dequantize_mul_mat_vec_q5_0(__global struct block_q5_0* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { + const int block_size = get_local_size(0); + const int row = get_global_id(0) / block_size; + const int tid = get_local_id(0); + + const uint qk = QK5_0; + const uint qr = QR5_0; + + const int y_offset = qr == 1 ? 
1 : qk/2; + + tmp[tid] = 0; + + for (int i = 0; i < ncols/block_size; i += 2) { + const int col = i*block_size + 2*tid; + const int ib = (row*ncols + col)/qk; // block index + const int iqs = (col%qk)/qr; // quant index + const int iybs = col - col%qk; // y block start index + + // dequantize + float v0, v1; + dequantize_q5_0(x, ib, iqs, &v0, &v1); + + // matrix multiplication + tmp[tid] += v0 * y[iybs + iqs + 0]; + tmp[tid] += v1 * y[iybs + iqs + y_offset]; + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=block_size/2; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = vload_half(0, (__global half*) &x[ib].d); + const float m = vload_half(0, (__global half*) &x[ib].m); + + uint32_t qh = x[ib].qh; + + const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0); + const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1); + + *v0 = x0*d + m; + *v1 = x1*d + m; +} +__kernel void dequantize_mul_mat_vec_q5_1(__global struct block_q5_1* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { + const int block_size = get_local_size(0); + const int row = get_global_id(0) / block_size; + const int tid = get_local_id(0); + + const uint qk = QK5_1; + const uint qr = QR5_1; + + const int y_offset = qr == 1 ? 1 : qk/2; + + tmp[tid] = 0; + + for (int i = 0; i < ncols/block_size; i += 2) { + const int col = i*block_size + 2*tid; + const int ib = (row*ncols + col)/qk; // block index + const int iqs = (col%qk)/qr; // quant index + const int iybs = col - col%qk; // y block start index + + // dequantize + float v0, v1; + dequantize_q5_1(x, ib, iqs, &v0, &v1); + + // matrix multiplication + tmp[tid] += v0 * y[iybs + iqs + 0]; + tmp[tid] += v1 * y[iybs + iqs + y_offset]; + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=block_size/2; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const int iqs, float* v0, float* v1) { + const float d = x[ib].d; + + const int8_t vi0 = x[ib].qs[iqs + 0]; + const int8_t vi1 = x[ib].qs[iqs + 1]; + + *v0 = vi0*d; + *v1 = vi1*d; +} +__kernel void dequantize_mul_mat_vec_q8_0(__global struct block_q8_0* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { + const int block_size = get_local_size(0); + const int row = get_global_id(0) / block_size; + const int tid = get_local_id(0); + + const uint qk = QK8_0; + const uint qr = QR8_0; + + const int y_offset = qr == 1 ? 
1 : qk/2; + + tmp[tid] = 0; + + for (int i = 0; i < ncols/block_size; i += 2) { + const int col = i*block_size + 2*tid; + const int ib = (row*ncols + col)/qk; // block index + const int iqs = (col%qk)/qr; // quant index + const int iybs = col - col%qk; // y block start index + + // dequantize + float v0, v1; + dequantize_q8_0(x, ib, iqs, &v0, &v1); + + // matrix multiplication + tmp[tid] += v0 * y[iybs + iqs + 0]; + tmp[tid] += v1 * y[iybs + iqs + y_offset]; + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=block_size/2; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} + +void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){ + *v0 = vload_half(0, (__global half*) &x[ib + 0]); + *v1 = vload_half(0, (__global half*) &x[ib + 1]); +} +__kernel void convert_mul_mat_vec_f16(__global half* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { + const int block_size = get_local_size(0); + const int row = get_global_id(0) / block_size; + const int tid = get_local_id(0); + + const uint qk = 32; + const uint qr = 1; + + const int y_offset = qr == 1 ? 1 : qk/2; + + tmp[tid] = 0; + + for (int i = 0; i < ncols/block_size; i += 2) { + const int col = i*block_size + 2*tid; + const int ib = (row*ncols + col)/qk; // block index + const int iqs = (col%qk)/qr; // quant index + const int iybs = col - col%qk; // y block start index + + // convert + float v0, v1; + convert_f16(x, ib, iqs, &v0, &v1); + + // matrix multiplication + tmp[tid] += v0 * y[iybs + iqs + 0]; + tmp[tid] += v1 * y[iybs + iqs + y_offset]; + } + + // sum up partial sums and write back result + barrier(CLK_LOCAL_MEM_FENCE); + for (int s=block_size/2; s>0; s>>=1) { + if (tid < s) { + tmp[tid] += tmp[tid + s]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (tid == 0) { + dst[row] = tmp[0]; + } +} +); + +#define CL_CHECK(err, name) \ + do { \ + cl_int err_ = (err); \ + if (err_ != CL_SUCCESS) { \ + fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \ + exit(1); \ + } \ + } while (0) + +static cl_platform_id platform; +static cl_device_id device; +static cl_context context; +static cl_command_queue queue; +static cl_program program; +static cl_kernel convert_fp16_to_fp32_cl; +static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl; +static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl; +static bool fp16_support; + +static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) { + cl_program p; + char *program_log; + size_t program_size, log_size; + int err; + + program_size = strlen(program_buffer); + + p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err); + if(err < 0) { + fprintf(stderr, "OpenCL error creating program"); + exit(1); + } + + err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL); + if(err < 0) { + + clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + program_log = (char*) malloc(log_size + 1); + program_log[log_size] = '\0'; + clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL); + printf("%s\n", program_log); + free(program_log); + exit(1); + } 
+ + return p; +} + +void ggml_cl_init(void) { + cl_int err = 0; + char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM"); + char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE"); + int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM)); + int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE)); + printf("\nInitializing CLBlast (First Run)..."); + printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num); + cl_uint num_platforms; + clGetPlatformIDs(0, NULL, &num_platforms); + cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id)); + clGetPlatformIDs(num_platforms, platforms, NULL); + platform = platforms[plat_num]; + char platform_buffer[1024]; + clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL); + cl_uint num_devices; + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); + cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); + device = devices[dev_num]; + char device_buffer[1024]; + clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL); + size_t ext_str_size; + clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size); + char* ext_buffer = (char*) malloc(sizeof(char) * ext_str_size); + clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL); + // Check if ext_buffer contains cl_khr_fp16 + for (size_t i = 0; i < ext_str_size - 12; i++) { + if (memcmp(ext_buffer + i, "cl_khr_fp16", 11) == 0) { + fp16_support = true; + break; + } + } + free(ext_buffer); + printf("Using Platform: %s Device: %s FP16: %d\n", platform_buffer, device_buffer, fp16_support); + context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); + CL_CHECK(err, "clCreateContext"); + queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err); + CL_CHECK(err, "clCreateCommandQueue"); + + free(platforms); + free(devices); + + program = build_program_from_source(context, device, clblast_dequant); + + // FP16 to FP32 kernel + convert_fp16_to_fp32_cl = clCreateKernel(program, "convert_fp16_to_fp32", &err); + CL_CHECK(err, "clCreateKernel"); + + // Dequantize kernels + dequantize_row_q4_0_cl = clCreateKernel(program, "dequantize_row_q4_0", &err); + CL_CHECK(err, "clCreateKernel"); + dequantize_row_q4_1_cl = clCreateKernel(program, "dequantize_row_q4_1", &err); + CL_CHECK(err, "clCreateKernel"); + dequantize_row_q5_0_cl = clCreateKernel(program, "dequantize_row_q5_0", &err); + CL_CHECK(err, "clCreateKernel"); + dequantize_row_q5_1_cl = clCreateKernel(program, "dequantize_row_q5_1", &err); + CL_CHECK(err, "clCreateKernel"); + dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err); + CL_CHECK(err, "clCreateKernel"); + + // dequant mul mat kernel + dequantize_mul_mat_vec_q4_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_0", &err); + CL_CHECK(err, "clCreateKernel"); + dequantize_mul_mat_vec_q4_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_1", &err); + CL_CHECK(err, "clCreateKernel"); + dequantize_mul_mat_vec_q5_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_0", &err); + CL_CHECK(err, "clCreateKernel"); + dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err); + CL_CHECK(err, "clCreateKernel"); + dequantize_mul_mat_vec_q8_0_cl = 
clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err); + CL_CHECK(err, "clCreateKernel"); + convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err); + CL_CHECK(err, "clCreateKernel"); +} + +static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return &dequantize_row_q4_0_cl; + case GGML_TYPE_Q4_1: + return &dequantize_row_q4_1_cl; + case GGML_TYPE_Q5_0: + return &dequantize_row_q5_0_cl; + case GGML_TYPE_Q5_1: + return &dequantize_row_q5_1_cl; + case GGML_TYPE_Q8_0: + return &dequantize_row_q8_0_cl; + case GGML_TYPE_F16: + return &convert_fp16_to_fp32_cl; + default: + return nullptr; + } +} + +static cl_kernel* ggml_get_dequantize_mul_mat_vec_cl(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return &dequantize_mul_mat_vec_q4_0_cl; + case GGML_TYPE_Q4_1: + return &dequantize_mul_mat_vec_q4_1_cl; + case GGML_TYPE_Q5_0: + return &dequantize_mul_mat_vec_q5_0_cl; + case GGML_TYPE_Q5_1: + return &dequantize_mul_mat_vec_q5_1_cl; + case GGML_TYPE_Q8_0: + return &dequantize_mul_mat_vec_q8_0_cl; + case GGML_TYPE_F16: + return &convert_mul_mat_vec_f16_cl; + default: + return nullptr; + } +} + +// buffer pool for cl +#define MAX_CL_BUFFERS 256 + +struct scoped_spin_lock { + std::atomic_flag& lock; + scoped_spin_lock(std::atomic_flag& lock) : lock(lock) { + while (lock.test_and_set(std::memory_order_acquire)) { + ; // spin + } + } + ~scoped_spin_lock() { + lock.clear(std::memory_order_release); + } + scoped_spin_lock(const scoped_spin_lock&) = delete; + scoped_spin_lock& operator=(const scoped_spin_lock&) = delete; +}; + +struct cl_buffer { + cl_mem mem; + size_t size = 0; +}; + +static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS]; +static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT; + +static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flags flags) { + scoped_spin_lock lock(g_cl_pool_lock); + cl_int err; + + for (int i = 0; i < MAX_CL_BUFFERS; ++i) { + cl_buffer& b = g_cl_buffer_pool[i]; + if (b.size > 0 && b.size >= size) { + cl_mem mem = b.mem; + *actual_size = b.size; + b.size = 0; + return mem; + } + } + cl_mem mem = clCreateBuffer(context, flags, size, NULL, &err); + CL_CHECK(err, "clCreateBuffer"); + *actual_size = size; + return mem; +} + +static void ggml_cl_pool_free(cl_mem mem, size_t size) { + scoped_spin_lock lock(g_cl_pool_lock); + + for (int i = 0; i < MAX_CL_BUFFERS; ++i) { + cl_buffer& b = g_cl_buffer_pool[i]; + if (b.size == 0) { + b.mem = mem; + b.size = size; + return; + } + } + fprintf(stderr, "WARNING: cl buffer pool full, increase MAX_CL_BUFFERS\n"); + clReleaseMemObject(mem); +} + +static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) { + cl_int err; + const uint64_t ne0 = src->ne[0]; + const uint64_t ne1 = src->ne[1]; + const uint64_t nb0 = src->nb[0]; + const uint64_t nb1 = src->nb[1]; + const uint64_t nb2 = src->nb[2]; + const uint64_t nb3 = src->nb[3]; + const enum ggml_type type = src->type; + const size_t ts = ggml_type_size(type); + const size_t bs = ggml_blck_size(type); + + const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3); + if (nb0 == ts && nb1 == ts*ne0/bs) { + err = clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*nb1, x, 0, NULL, ev); + return err; + } + if (nb0 == ts) { + const size_t buffer_origin[3] = { offset, 0, 0 }; + const size_t host_origin[3] = { 0, 0, 0 }; + const size_t region[3] = { ts*ne0/bs, 
ne1, 1 }; + err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts*ne0/bs, 0, nb1, 0, x, 0, NULL, ev); + return err; + } + for (uint64_t i1 = 0; i1 < ne1; i1++) { + // pretend the row is a matrix with cols=1 + const size_t buffer_origin[3] = { offset, i1, 0 }; + const size_t host_origin[3] = { 0, 0, 0 }; + const size_t region[3] = { ts/bs, ne0, 1 }; + err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, 0, 0, nb0, 0, ((const char *)x) + i1*nb0, 0, NULL, ev); + if (err != CL_SUCCESS) { + break; + } + } + return err; +} + +static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const float alpha = 1.0f; + const float beta = 0.0f; + const int x_ne = ne01 * ne00; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + + size_t x_size, y_size, d_size; + cl_mem d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_ONLY); + cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY); + cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY); + + cl_int err; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + // copy data to device + err = ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL); + err |= ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL); + CL_CHECK(err, "ggml_cl_h2d_tensor_2d"); + + CL_CHECK(clFinish(queue), "clFinish"); + + // compute + cl_event ev_sgemm; + + clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor, + (CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo, + ne01, ne11, ne10, + alpha, + d_X, 0, ne00, + d_Y, 0, ne10, + beta, + d_D, 0, ne01, + &queue, &ev_sgemm); + + if (status != clblast::StatusCode::kSuccess) { + GGML_ASSERT(false); + } + + // copy dst to host + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + err = clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL); + CL_CHECK(err, "clEnqueueReadBuffer"); + } + } + + ggml_cl_pool_free(d_X, x_size); + ggml_cl_pool_free(d_Y, y_size); + ggml_cl_pool_free(d_D, d_size); +} + +static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) { + GGML_ASSERT(fp16_support); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + const int nb10 = src1->nb[0]; + const int nb11 = src1->nb[1]; + const int nb12 = src1->nb[2]; + const int nb13 = src1->nb[3]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f); + const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f); + const int x_ne = ne01 * ne00; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + + size_t x_size, y_size, d_size; + cl_mem d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY); + cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size, CL_MEM_READ_ONLY); + cl_mem 
d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size, CL_MEM_WRITE_ONLY); + + cl_int err; + + bool src1_cont_rows = nb10 == sizeof(float); + bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + // copy src0 to device + err = ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL); + CL_CHECK(err, "ggml_cl_h2d_tensor_2d"); + + // convert src1 to fp16 + // TODO: use multiple threads + ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02); + char * src1i = (char *) src1->data + i03*nb13 + i02*nb12; + if (src1_cont_rows) { + if (src1_cont_cols) { + ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11); + } + else { + for (int64_t i01 = 0; i01 < ne11; i01++) { + ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10); + } + } + } + else { + for (int64_t i01 = 0; i01 < ne11; i01++) { + for (int64_t i00 = 0; i00 < ne10; i00++) { + // very slow due to no inlining + tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10)); + } + } + } + + // copy src1 to device + err |= clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL); + CL_CHECK(err, "ggml_cl_h2d_tensor_2d"); + + CL_CHECK(clFinish(queue), "clFinish"); + + // compute + cl_event ev_sgemm; + clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor, + (CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo, + ne01, ne11, ne10, + alpha, + d_X, 0, ne00, + d_Y, 0, ne10, + beta, + d_D, 0, ne01, + &queue, &ev_sgemm); + + if (status != clblast::StatusCode::kSuccess) { + GGML_ASSERT(false); + } + + // copy dst to host, then convert to float + err = clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL); + + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + + ggml_fp16_to_fp32_row(tmp, d, d_ne); + } + } + + ggml_cl_pool_free(d_X, x_size); + ggml_cl_pool_free(d_Y, y_size); + ggml_cl_pool_free(d_D, d_size); +} + +static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + const ggml_type type = src0->type; + const bool mul_mat_vec = ne11 == 1; + + const float alpha = 1.0f; + const float beta = 0.0f; + const int x_ne = ne01 * ne00; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type); + + size_t x_size, y_size, d_size, q_size; + cl_mem d_X; + if (!mul_mat_vec) { + d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_WRITE); + } + cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY); + cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY); + cl_mem d_Q; + if (src0->backend == GGML_BACKEND_CPU) { + d_Q = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY); + } + + cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type); + cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type); + GGML_ASSERT(to_fp32_cl != nullptr); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + cl_event ev_sgemm; + + // 
copy src0 to device if necessary + if (src0->backend == GGML_BACKEND_CPU) { + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, NULL), "ggml_cl_h2d_tensor_2d"); + } else if (src0->backend == GGML_BACKEND_CL) { + d_Q = *(cl_mem*) src0->data; + } else { + GGML_ASSERT(false); + } + if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel + // copy src1 to device + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL), "ggml_cl_h2d_tensor_2d"); + + // compute + // dequantize_mul_mat_vec(__global void * vx, __local float* tmp, __global float * y, __global float * dst, __global int ncols, __global int vx_type) { + const size_t global = ne01 * CL_DMMV_BLOCK_SIZE; + const size_t local = CL_DMMV_BLOCK_SIZE; + const cl_int ncols = ne00; + CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q), "clSetKernelArg"); + CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL), "clSetKernelArg"); + CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y), "clSetKernelArg"); + CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D), "clSetKernelArg"); + CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols), "clSetKernelArg"); + CL_CHECK(clFinish(queue), "clFinish"); + CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, 0, NULL, &ev_sgemm), "clEnqueueNDRangeKernel"); + } else { // general dequantization kernel + CLBlast matrix matrix multiplication + // convert src0 to fp32 on device + const size_t global = x_ne; + const size_t local = ggml_blck_size(type) / 2; + CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q), "clSetKernelArg"); + CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X), "clSetKernelArg"); + CL_CHECK(clFinish(queue), "clFinish"); + CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, &local, 0, NULL, NULL), "clEnqueueNDRangeKernel"); + + // copy src1 to device + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL), "ggml_cl_h2d_tensor_2d"); + + // wait for conversion + CL_CHECK(clFinish(queue), "clFinish"); + + // compute + clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor, + (CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo, + ne01, ne11, ne10, + alpha, + d_X, 0, ne00, + d_Y, 0, ne10, + beta, + d_D, 0, ne01, + &queue, &ev_sgemm); + + if (status != clblast::StatusCode::kSuccess) { + GGML_ASSERT(false); + } + } + + // copy dst to host + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL), "clEnqueueReadBuffer"); + clReleaseEvent(ev_sgemm); + } + } + + if (!mul_mat_vec) { + ggml_cl_pool_free(d_X, x_size); + } + ggml_cl_pool_free(d_Y, y_size); + ggml_cl_pool_free(d_D, d_size); + if (src0->backend == GGML_BACKEND_CPU) { + ggml_cl_pool_free(d_Q, q_size); + } +} + + +bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + // TODO: find the optimal values for these + if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && + src1->type == GGML_TYPE_F32 && + dst->type == GGML_TYPE_F32 && + ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CL)) { + return true; + } + + return false; +} + +bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const 
struct ggml_tensor * src1, struct ggml_tensor * /* dst */) { + // If device doesn't support FP16 + if (!fp16_support) { + return false; + } + + size_t src0_sz = ggml_nbytes(src0); + size_t src1_sz = ggml_nbytes(src1); + + // mul_mat_q: src0 is converted to fp32 on device + size_t mul_mat_q_transfer = src0_sz + src1_sz; + + // mul_mat_f16: src1 is converted to fp16 on cpu + size_t mul_mat_f16_transfer = src0_sz + sizeof(ggml_fp16_t) * ggml_nelements(src1); + + // choose the smaller one to transfer to the device + // TODO: this is not always the best choice due to the overhead of converting to fp16 + return mul_mat_f16_transfer < mul_mat_q_transfer; +} + +void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) { + GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst)); + + if (src0->type == GGML_TYPE_F32) { + ggml_cl_mul_mat_f32(src0, src1, dst); + } + else if (src0->type == GGML_TYPE_F16) { + if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) { + ggml_cl_mul_mat_f16(src0, src1, dst, wdata, wsize); + } + else { + ggml_cl_mul_mat_q_f32(src0, src1, dst); + } + } + else if (ggml_is_quantized(src0->type)) { + ggml_cl_mul_mat_q_f32(src0, src1, dst); + } + else { + GGML_ASSERT(false); + } +} + +size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) { + return ggml_nelements(src1) * sizeof(ggml_fp16_t); + } + return 0; +} + +void ggml_cl_transform_tensor(ggml_tensor * tensor) { + const int64_t ne0 = tensor->ne[0]; + const int64_t ne1 = tensor->ne[1]; + const int64_t ne2 = tensor->ne[2]; + const int64_t ne3 = tensor->ne[3]; + + const ggml_type type = tensor->type; + const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type); + + size_t q_size; + cl_mem* d_Q = (cl_mem*) malloc(sizeof(cl_mem)); + *d_Q = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY); + + // copy tensor to device + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, *d_Q, 0, tensor, 0, 0, NULL), "ggml_cl_h2d_tensor_2d"); + CL_CHECK(clFinish(queue), "clFinish"); + + tensor->data = d_Q; + tensor->backend = GGML_BACKEND_CL; +} diff --git a/ggml-opencl.h b/ggml-opencl.h index 7bcc603ef..5a1a50093 100644 --- a/ggml-opencl.h +++ b/ggml-opencl.h @@ -1,23 +1,21 @@ #pragma once +#include "ggml.h" + #ifdef __cplusplus extern "C" { #endif void ggml_cl_init(void); -enum ggml_blas_order { - GGML_BLAS_ORDER_ROW_MAJOR = 101, - GGML_BLAS_ORDER_COLUMN_MAJOR = 102, -}; +bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); -enum ggml_blas_op { - GGML_BLAS_OP_N = 111, - GGML_BLAS_OP_T = 112, - GGML_BLAS_OP_C = 113, -}; +void * ggml_cl_host_malloc(size_t size); +void ggml_cl_host_free(void * ptr); -void ggml_cl_sgemm_wrapper(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype); +void ggml_cl_transform_tensor(struct ggml_tensor * tensor); #ifdef __cplusplus } diff --git a/ggml.c 
b/ggml.c index f99a250f1..fbf75d706 100644 --- a/ggml.c +++ b/ggml.c @@ -9327,7 +9327,7 @@ static void ggml_compute_forward_rms_norm_back( // ggml_compute_forward_mul_mat -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // helper function to determine if it is better to use BLAS or not // for large matrices, BLAS is faster static bool ggml_compute_forward_mul_mat_use_blas( @@ -9368,7 +9368,7 @@ static void ggml_compute_forward_mul_mat_f32( const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) const int64_t ne10 = src1->ne[0]; #endif const int64_t ne11 = src1->ne[1]; @@ -9432,9 +9432,16 @@ static void ggml_compute_forward_mul_mat_f32( } return; } +#elif defined(GGML_USE_CLBLAST) + if (ggml_cl_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { + ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); + } + return; + } #endif -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { if (params->ith != 0) { return; @@ -9454,33 +9461,11 @@ static void ggml_compute_forward_mul_mat_f32( const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); -#if defined(GGML_USE_CLBLAST) - // zT = y * xT - if(quants_unshuffled) - { - ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne10, - 0.0f, d, ne01, - GGML_TYPE_F32); - } - else - { - ggml_cl_sgemm_wrapper_legacy(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne10, - 0.0f, d, ne01, - GGML_TYPE_F32); - } -#else cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, x, ne00, 0.0f, d, ne01); -#endif } } //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); @@ -9619,9 +9604,16 @@ static void ggml_compute_forward_mul_mat_f16_f32( } return; } +#elif defined(GGML_USE_CLBLAST) + if (ggml_cl_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { + ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); + } + return; + } #endif -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { GGML_ASSERT(nb10 == sizeof(float)); @@ -9651,32 +9643,6 @@ static void ggml_compute_forward_mul_mat_f16_f32( assert(id*sizeof(float) <= params->wsize); } -#if defined(GGML_USE_CLBLAST) - const float * x = wdata; - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - // zT = y * xT - if(quants_unshuffled) - { - ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne10, - 0.0f, d, ne01, - GGML_TYPE_F32); - } - else - { - ggml_cl_sgemm_wrapper_legacy(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne10, - 
0.0f, d, ne01, - GGML_TYPE_F32); - } -#else const float * x = wdata; const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); @@ -9688,7 +9654,6 @@ static void ggml_compute_forward_mul_mat_f16_f32( 1.0f, y, ne10, x, ne00, 0.0f, d, ne01); -#endif } } @@ -9851,9 +9816,16 @@ static void ggml_compute_forward_mul_mat_q_f32( } return; } +#elif defined(GGML_USE_CLBLAST) + if (ggml_cl_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { + ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); + } + return; + } #endif -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { if (params->ith != 0) { return; @@ -9876,9 +9848,6 @@ static void ggml_compute_forward_mul_mat_q_f32( float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); -#if defined(GGML_USE_CLBLAST) - const void* x = (char *) src0->data + i03*nb03 + i02*nb02; -#else { size_t id = 0; for (int64_t i01 = 0; i01 < ne01; ++i01) { @@ -9890,35 +9859,12 @@ static void ggml_compute_forward_mul_mat_q_f32( } const float * x = wdata; -#endif -#if defined(GGML_USE_CLBLAST) - // zT = y * xT - if(quants_unshuffled) - { - ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne10, - 0.0f, d, ne01, - type); - } - else - { - ggml_cl_sgemm_wrapper_legacy(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne10, - 0.0f, d, ne01, - type); - } -#else cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, x, ne00, 0.0f, d, ne01); -#endif } } @@ -14024,9 +13970,16 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node); } else +#elif defined(GGML_USE_CLBLAST) + if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { + node->n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); + } + else #endif if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; // TODO: this actually is doing nothing // the threads are still spinning @@ -14040,13 +13993,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) #endif } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) { cur = 0; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; } #endif } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); diff --git a/ggml.h b/ggml.h index 
e6bbe803e..f8400e39c 100644
--- a/ggml.h
+++ b/ggml.h
@@ -250,6 +250,7 @@ extern "C" {
     enum ggml_backend {
         GGML_BACKEND_CPU = 0,
         GGML_BACKEND_CUDA = 1,
+        GGML_BACKEND_CL = 2,
     };
 
     // model file types
diff --git a/include/clblast.h b/include/clblast.h
new file mode 100644
index 000000000..99364ec61
--- /dev/null
+++ b/include/clblast.h
@@ -0,0 +1,792 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the interface to the CLBlast BLAS routines. It also contains the definitions
+// of the returned status codes and the layout and transpose types. This is the only header users
+// of CLBlast should include and use.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_CLBLAST_H_
+#define CLBLAST_CLBLAST_H_
+
+#include <cstdlib> // For size_t
+#include <string> // For OverrideParameters function
+#include <vector> // For OverrideParameters function
+
+// Includes the normal OpenCL C header
+#if defined(__APPLE__) || defined(__MACOSX)
+  #include <OpenCL/opencl.h>
+#else
+  #include <CL/opencl.h>
+#endif
+
+// Exports library functions under Windows when building a DLL. See also:
+// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
+#if defined(_WIN32) && defined(CLBLAST_DLL)
+  #if defined(COMPILING_DLL)
+    #define PUBLIC_API __declspec(dllexport)
+  #else
+    #define PUBLIC_API __declspec(dllimport)
+  #endif
+#else
+  #define PUBLIC_API
+#endif
+
+// Version numbering (v1.5.3)
+#define CLBLAST_VERSION_MAJOR 1
+#define CLBLAST_VERSION_MINOR 5
+#define CLBLAST_VERSION_PATCH 3
+
+namespace clblast {
+// =================================================================================================
+
+// Status codes. These codes can be returned by functions declared in this header file. The error
+// codes match either the standard OpenCL error codes or the clBLAS error codes.
+enum class StatusCode { + + // Status codes in common with the OpenCL standard + kSuccess = 0, // CL_SUCCESS + kOpenCLCompilerNotAvailable= -3, // CL_COMPILER_NOT_AVAILABLE + kTempBufferAllocFailure = -4, // CL_MEM_OBJECT_ALLOCATION_FAILURE + kOpenCLOutOfResources = -5, // CL_OUT_OF_RESOURCES + kOpenCLOutOfHostMemory = -6, // CL_OUT_OF_HOST_MEMORY + kOpenCLBuildProgramFailure = -11, // CL_BUILD_PROGRAM_FAILURE: OpenCL compilation error + kInvalidValue = -30, // CL_INVALID_VALUE + kInvalidCommandQueue = -36, // CL_INVALID_COMMAND_QUEUE + kInvalidMemObject = -38, // CL_INVALID_MEM_OBJECT + kInvalidBinary = -42, // CL_INVALID_BINARY + kInvalidBuildOptions = -43, // CL_INVALID_BUILD_OPTIONS + kInvalidProgram = -44, // CL_INVALID_PROGRAM + kInvalidProgramExecutable = -45, // CL_INVALID_PROGRAM_EXECUTABLE + kInvalidKernelName = -46, // CL_INVALID_KERNEL_NAME + kInvalidKernelDefinition = -47, // CL_INVALID_KERNEL_DEFINITION + kInvalidKernel = -48, // CL_INVALID_KERNEL + kInvalidArgIndex = -49, // CL_INVALID_ARG_INDEX + kInvalidArgValue = -50, // CL_INVALID_ARG_VALUE + kInvalidArgSize = -51, // CL_INVALID_ARG_SIZE + kInvalidKernelArgs = -52, // CL_INVALID_KERNEL_ARGS + kInvalidLocalNumDimensions = -53, // CL_INVALID_WORK_DIMENSION: Too many thread dimensions + kInvalidLocalThreadsTotal = -54, // CL_INVALID_WORK_GROUP_SIZE: Too many threads in total + kInvalidLocalThreadsDim = -55, // CL_INVALID_WORK_ITEM_SIZE: ... or for a specific dimension + kInvalidGlobalOffset = -56, // CL_INVALID_GLOBAL_OFFSET + kInvalidEventWaitList = -57, // CL_INVALID_EVENT_WAIT_LIST + kInvalidEvent = -58, // CL_INVALID_EVENT + kInvalidOperation = -59, // CL_INVALID_OPERATION + kInvalidBufferSize = -61, // CL_INVALID_BUFFER_SIZE + kInvalidGlobalWorkSize = -63, // CL_INVALID_GLOBAL_WORK_SIZE + + // Status codes in common with the clBLAS library + kNotImplemented = -1024, // Routine or functionality not implemented yet + kInvalidMatrixA = -1022, // Matrix A is not a valid OpenCL buffer + kInvalidMatrixB = -1021, // Matrix B is not a valid OpenCL buffer + kInvalidMatrixC = -1020, // Matrix C is not a valid OpenCL buffer + kInvalidVectorX = -1019, // Vector X is not a valid OpenCL buffer + kInvalidVectorY = -1018, // Vector Y is not a valid OpenCL buffer + kInvalidDimension = -1017, // Dimensions M, N, and K have to be larger than zero + kInvalidLeadDimA = -1016, // LD of A is smaller than the matrix's first dimension + kInvalidLeadDimB = -1015, // LD of B is smaller than the matrix's first dimension + kInvalidLeadDimC = -1014, // LD of C is smaller than the matrix's first dimension + kInvalidIncrementX = -1013, // Increment of vector X cannot be zero + kInvalidIncrementY = -1012, // Increment of vector Y cannot be zero + kInsufficientMemoryA = -1011, // Matrix A's OpenCL buffer is too small + kInsufficientMemoryB = -1010, // Matrix B's OpenCL buffer is too small + kInsufficientMemoryC = -1009, // Matrix C's OpenCL buffer is too small + kInsufficientMemoryX = -1008, // Vector X's OpenCL buffer is too small + kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small + + // Custom additional status codes for CLBlast + kInsufficientMemoryTemp = -2050, // Temporary buffer provided to GEMM routine is too small + kInvalidBatchCount = -2049, // The batch count needs to be positive + kInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel + kMissingOverrideParameter = -2047, // Missing override parameter(s) for the target kernel + kInvalidLocalMemUsage = -2046, // Not enough local memory 
available on this device + kNoHalfPrecision = -2045, // Half precision (16-bits) not supported by the device + kNoDoublePrecision = -2044, // Double precision (64-bits) not supported by the device + kInvalidVectorScalar = -2043, // The unit-sized vector is not a valid OpenCL buffer + kInsufficientMemoryScalar = -2042, // The unit-sized vector's OpenCL buffer is too small + kDatabaseError = -2041, // Entry for the device was not found in the database + kUnknownError = -2040, // A catch-all error code representing an unspecified error + kUnexpectedError = -2039, // A catch-all error code representing an unexpected exception +}; + +// Matrix layout and transpose types +enum class Layout { kRowMajor = 101, kColMajor = 102 }; +enum class Transpose { kNo = 111, kYes = 112, kConjugate = 113 }; +enum class Triangle { kUpper = 121, kLower = 122 }; +enum class Diagonal { kNonUnit = 131, kUnit = 132 }; +enum class Side { kLeft = 141, kRight = 142 }; +enum class KernelMode { kCrossCorrelation = 151, kConvolution = 152 }; + +// Precision scoped enum (values in bits) +enum class Precision { kHalf = 16, kSingle = 32, kDouble = 64, + kComplexSingle = 3232, kComplexDouble = 6464, kAny = -1 }; + +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// Generate givens plane rotation: SROTG/DROTG +template +StatusCode Rotg(cl_mem sa_buffer, const size_t sa_offset, + cl_mem sb_buffer, const size_t sb_offset, + cl_mem sc_buffer, const size_t sc_offset, + cl_mem ss_buffer, const size_t ss_offset, + cl_command_queue* queue, cl_event* event = nullptr); + +// Generate modified givens plane rotation: SROTMG/DROTMG +template +StatusCode Rotmg(cl_mem sd1_buffer, const size_t sd1_offset, + cl_mem sd2_buffer, const size_t sd2_offset, + cl_mem sx1_buffer, const size_t sx1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, + cl_mem sparam_buffer, const size_t sparam_offset, + cl_command_queue* queue, cl_event* event = nullptr); + +// Apply givens plane rotation: SROT/DROT +template +StatusCode Rot(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const T cos, + const T sin, + cl_command_queue* queue, cl_event* event = nullptr); + +// Apply modified givens plane rotation: SROTM/DROTM +template +StatusCode Rotm(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem sparam_buffer, const size_t sparam_offset, + cl_command_queue* queue, cl_event* event = nullptr); + +// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP +template +StatusCode Swap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL +template +StatusCode Scal(const size_t n, + const T alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY +template +StatusCode Copy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// 
Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY +template +StatusCode Axpy(const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Dot product of two vectors: SDOT/DDOT/HDOT +template +StatusCode Dot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Dot product of two complex vectors: CDOTU/ZDOTU +template +StatusCode Dotu(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC +template +StatusCode Dotc(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 +template +StatusCode Nrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM +template +StatusCode Asum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM +template +StatusCode Sum(const size_t n, + cl_mem sum_buffer, const size_t sum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX +template +StatusCode Amax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN +template +StatusCode Amin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX +template +StatusCode Max(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN +template +StatusCode Min(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// ================================================================================================= +// BLAS level-2 (matrix-vector) 
routines +// ================================================================================================= + +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV +template +StatusCode Gemv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV +template +StatusCode Gbmv(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Hermitian matrix-vector multiplication: CHEMV/ZHEMV +template +StatusCode Hemv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV +template +StatusCode Hbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV +template +StatusCode Hpmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV +template +StatusCode Symv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV +template +StatusCode Sbmv(const Layout layout, const Triangle triangle, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV +template +StatusCode Spmv(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem ap_buffer, const size_t ap_offset, + const cl_mem 
x_buffer, const size_t x_offset, const size_t x_inc, + const T beta, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV +template +StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV +template +StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV +template +StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV +template +StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV +template +StatusCode Tbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, const size_t k, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV +template +StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem ap_buffer, const size_t ap_offset, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// General rank-1 matrix update: SGER/DGER/HGER +template +StatusCode Ger(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// General rank-1 complex matrix update: CGERU/ZGERU +template +StatusCode Geru(const Layout layout, + const size_t m, const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// General rank-1 complex conjugated matrix update: CGERC/ZGERC +template +StatusCode Gerc(const Layout layout, + 
const size_t m, const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Hermitian rank-1 matrix update: CHER/ZHER +template +StatusCode Her(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Hermitian packed rank-1 matrix update: CHPR/ZHPR +template +StatusCode Hpr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event = nullptr); + +// Hermitian rank-2 matrix update: CHER2/ZHER2 +template +StatusCode Her2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 +template +StatusCode Hpr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event = nullptr); + +// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR +template +StatusCode Syr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR +template +StatusCode Spr(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event = nullptr); + +// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 +template +StatusCode Syr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 +template +StatusCode Spr2(const Layout layout, const Triangle triangle, + const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event = nullptr); + +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================= + +// General matrix-matrix 
multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM +template +StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event = nullptr, + cl_mem temp_buffer = nullptr); + +// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM +template +StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM +template +StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK +template +StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Rank-K update of a hermitian matrix: CHERK/ZHERK +template +StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K +template +StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K +template +StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const U beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM +template +StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + 
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM +template +StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// ================================================================================================= +// Extra non-BLAS routines (level-X) +// ================================================================================================= + +// Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD +template +StatusCode Had(const size_t n, + const T alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const T beta, + cl_mem z_buffer, const size_t z_offset, const size_t z_inc, + cl_command_queue* queue, cl_event* event = nullptr); + +// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY +template +StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event = nullptr); + +// Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL +template +StatusCode Im2col(const KernelMode kernel_mode, + const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, + const cl_mem im_buffer, const size_t im_offset, + cl_mem col_buffer, const size_t col_offset, + cl_command_queue* queue, cl_event* event = nullptr); + +// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM +template +StatusCode Col2im(const KernelMode kernel_mode, + const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, + const cl_mem col_buffer, const size_t col_offset, + cl_mem im_buffer, const size_t im_offset, + cl_command_queue* queue, cl_event* event = nullptr); + +// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM +template +StatusCode Convgemm(const KernelMode kernel_mode, + const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count, + const cl_mem im_buffer, const size_t im_offset, + const cl_mem kernel_buffer, const size_t kernel_offset, + cl_mem result_buffer, const size_t result_offset, + cl_command_queue* queue, cl_event* event = nullptr); + +// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED +template 
+StatusCode AxpyBatched(const size_t n, + const T *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event = nullptr); + +// Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED +template <typename T> +StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T *alphas, + const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, + const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, + const T *betas, + cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, + const size_t batch_count, + cl_command_queue* queue, cl_event* event = nullptr); + +// StridedBatched version of GEMM: SGEMMSTRIDEDBATCHED/DGEMMSTRIDEDBATCHED/CGEMMSTRIDEDBATCHED/ZGEMMSTRIDEDBATCHED/HGEMMSTRIDEDBATCHED +template <typename T> +StatusCode GemmStridedBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, + const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, + const T beta, + cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, + const size_t batch_count, + cl_command_queue* queue, cl_event* event = nullptr); + +// ================================================================================================= + +// Retrieves the required size of the temporary buffer for the GEMM kernel (optional) +template <typename T> +StatusCode GemmTempBufferSize(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const size_t m, const size_t n, const size_t k, + const size_t a_offset, const size_t a_ld, + const size_t b_offset, const size_t b_ld, + const size_t c_offset, const size_t c_ld, + cl_command_queue* queue, size_t& temp_buffer_size); + +// ================================================================================================= + +// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on +// for the same device. This cache can be cleared to free up system memory or in case of debugging. +StatusCode PUBLIC_API ClearCache(); + +// The cache can also be pre-initialized for a specific device with all possible CLBlast kernels. +// Further CLBlast routine calls will then run at maximum speed. +StatusCode PUBLIC_API FillCache(const cl_device_id device); + +// ================================================================================================= + +// Retrieves current tuning parameters for a specific device-precision-kernel combination +StatusCode PUBLIC_API RetrieveParameters(const cl_device_id device, const std::string &kernel_name, + const Precision precision, + std::unordered_map<std::string, size_t> &parameters); +
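To make the GEMM entry points above concrete, here is a minimal host-side sketch of a single-precision call through the Gemm() routine declared earlier in this header. It assumes an already-created OpenCL queue and three cl_mem buffers holding row-major float matrices; the function name, buffer names and sizes are illustrative only and are not part of the patch.

// Hypothetical host-side call into clblast::Gemm(). A is m*k, B is k*n, C is m*n (row-major).
#include <clblast.h>   // also pulls in the OpenCL C headers used below

clblast::StatusCode RunSgemm(cl_command_queue queue,
                             cl_mem a_mem, cl_mem b_mem, cl_mem c_mem,
                             size_t m, size_t n, size_t k) {
  cl_event event = nullptr;
  auto status = clblast::Gemm<float>(clblast::Layout::kRowMajor,
                                     clblast::Transpose::kNo, clblast::Transpose::kNo,
                                     m, n, k,
                                     1.0f,               // alpha
                                     a_mem, 0, k,        // A buffer, offset, leading dimension
                                     b_mem, 0, n,        // B
                                     0.0f,               // beta
                                     c_mem, 0, n,        // C
                                     &queue, &event);
  if (status == clblast::StatusCode::kSuccess) {
    clWaitForEvents(1, &event);                          // block until the kernel has finished
    clReleaseEvent(event);
  }
  return status;
}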
+// Overrides tuning parameters for a specific device-precision-kernel combination. The next time +// the target routine is called it will re-compile and use the new parameters from then on. +StatusCode PUBLIC_API OverrideParameters(const cl_device_id device, const std::string &kernel_name, + const Precision precision, + const std::unordered_map<std::string, size_t> &parameters); + +// ================================================================================================= + +// Tunes the "Xaxpy" kernel, used for many level-1 routines such as XAXPY, XCOPY, and XSWAP +template <typename T> +StatusCode TuneXaxpy(cl_command_queue* queue, const size_t n, + const double fraction, std::unordered_map<std::string, size_t> &parameters); + +// Tunes the "Xdot" kernel, used for level-1 reduction routines such as XDOT, XMAX, and XSUM +template <typename T> +StatusCode TuneXdot(cl_command_queue* queue, const size_t n, + const double fraction, std::unordered_map<std::string, size_t> &parameters); + +// Tunes the "Xgemv" kernel, used for matrix-vector level-2 routines such as XGEMV, XGBMV, and XHEMV +template <typename T> +StatusCode TuneXgemv(cl_command_queue* queue, const size_t m, const size_t n, + const double fraction, std::unordered_map<std::string, size_t> &parameters); + +// Tunes the "Xger" kernel, used for matrix update level-2 routines such as XGER, XHER, and XSYR2 +template <typename T> +StatusCode TuneXger(cl_command_queue* queue, const size_t m, const size_t n, + const double fraction, std::unordered_map<std::string, size_t> &parameters); + +// Tunes the "Xgemm" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K +template <typename T> +StatusCode TuneXgemm(cl_command_queue* queue, const size_t m, const size_t n, const size_t k, + const double fraction, std::unordered_map<std::string, size_t> &parameters); + +// Tunes the "XgemmDirect" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K +template <typename T> +StatusCode TuneXgemmDirect(cl_command_queue* queue, const size_t m, const size_t n, const size_t k, + const double fraction, std::unordered_map<std::string, size_t> &parameters); + +// Tunes the "Copy" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K +template <typename T> +StatusCode TuneCopy(cl_command_queue* queue, const size_t m, const size_t n, + const double fraction, std::unordered_map<std::string, size_t> &parameters); + +// Tunes the "Pad" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K +template <typename T> +StatusCode TunePad(cl_command_queue* queue, const size_t m, const size_t n, + const double fraction, std::unordered_map<std::string, size_t> &parameters); + +// Tunes the "Transpose" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K +template <typename T> +StatusCode TuneTranspose(cl_command_queue* queue, const size_t m, const size_t n, + const double fraction, std::unordered_map<std::string, size_t> &parameters); + +// Tunes the "Padtranspose" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K +template <typename T> +StatusCode TunePadtranspose(cl_command_queue* queue, const size_t m, const size_t n, + const double fraction, std::unordered_map<std::string, size_t> &parameters); + +// Tunes the "Xgemm" kernel, used for the level-3 routine XTRSM +template <typename T> +StatusCode TuneInvert(cl_command_queue* queue, const size_t m, const size_t n, const size_t k, + const double fraction, std::unordered_map<std::string, size_t> &parameters); + +// ================================================================================================= + +} // namespace clblast + +// CLBLAST_CLBLAST_H_ +#endif diff --git a/include/clblast_half.h b/include/clblast_half.h new file mode 100644 index 000000000..b8de8537a --- /dev/null +++ b/include/clblast_half.h @@ -0,0 +1,254 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
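The tuning hooks declared above combine into a small retune step: read the current parameters for a kernel with RetrieveParameters, adjust an entry, and write them back with OverrideParameters so the next routine call re-compiles with the new values. A minimal sketch, assuming a valid cl_device_id; the "Xgemm" kernel name and the MWG entry are used purely as illustrative values and their legal settings are device-specific.

// Illustrative only: query and override tuning parameters for the "Xgemm" kernel.
#include <clblast.h>
#include <string>
#include <unordered_map>

void RetuneXgemm(cl_device_id device) {
  std::unordered_map<std::string, size_t> parameters;
  auto status = clblast::RetrieveParameters(device, "Xgemm",
                                            clblast::Precision::kSingle, parameters);
  if (status != clblast::StatusCode::kSuccess) { return; }

  parameters["MWG"] = 64;   // hypothetical work-group tile size; must be valid for this device
  clblast::OverrideParameters(device, "Xgemm", clblast::Precision::kSingle, parameters);
  // Subsequent single-precision GEMM calls on this device re-compile with the new parameters.
}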
This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file provides simple conversion operations between fp16 (half) and fp32 (float). These +// conversion functions are based on ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf and +// are also part of the C++ half-precision header (http://half.sourceforge.net/). +// +// This file is pure C99. +// +// ================================================================================================= + +#ifndef CLBLAST_HALF_H_ +#define CLBLAST_HALF_H_ + +// MSVC 2013 doesn't fully support C99 +#ifdef _MSC_VER + #define inline __inline +#endif + +// ================================================================================================= + +// The host data-type for half-precision floating-point (16-bit) is based on the `cl_half` OpenCL +// type, which is a typedef for unsigned short. +typedef unsigned short half; + +// 32-bit union for conversions +typedef union ConversionBits_ { + unsigned int i32; + float f32; +} ConversionBits; + +// ================================================================================================= + +// Converts a IEEE-compliant single-precision value to half-precision floating-point. This function +// applies simple truncation (round toward zero, but with overflows set to infinity) as rounding +// mode. +inline half FloatToHalf(const float value) { + static const unsigned short base_table[512] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, + 0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00, + 0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, + 0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00, + 0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00 + }; + static const unsigned char shift_table[512] = { + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13 + }; + ConversionBits bits; + bits.f32 = value; + const unsigned short halfbits = base_table[bits.i32 >> 23] + + (unsigned short)((bits.i32 & 0x7FFFFF) >> shift_table[bits.i32 >> 23]); + return halfbits; +} + +// Converts a half-precision value to IEEE-compliant single-precision floating-point +inline float HalfToFloat(const half value) { + static const unsigned int mantissa_table[2048] = { + 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000, + 0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000, + 0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000, + 0x36400000, 0x36440000, 0x36480000, 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, 0x367C0000, + 0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000, + 0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000, + 0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000, + 0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000, + 0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000, + 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000, + 0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000, + 0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000, + 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 
0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000, + 0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000, + 0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000, + 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377A0000, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000, + 0x37800000, 0x37808000, 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000, + 0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000, + 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, 0x37978000, + 0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000, + 0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000, + 0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000, + 0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000, + 0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000, + 0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000, + 0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000, + 0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000, + 0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000, + 0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000, + 0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000, + 0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 
0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000, + 0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, 0x37FF8000, + 0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000, + 0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000, + 0x38080000, 0x38084000, 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000, + 0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000, + 0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, 0x3813C000, + 0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000, + 0x38180000, 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000, + 0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000, + 0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000, + 0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000, + 0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000, + 0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000, + 0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000, + 0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000, + 0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000, + 0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000, + 0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000, + 
0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000, + 0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000, + 0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000, + 0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000, + 0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000, + 0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000, + 0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000, + 0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000, + 0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 0x38678000, 0x3867C000, + 0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000, + 0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000, + 0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000, + 0x38740000, 0x38744000, 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000, + 0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000, + 0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000, + 0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000, + 0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000, + 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000, + 0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, 
0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000, + 0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000, + 0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000, + 0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000, + 0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000, + 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000, + 0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000, + 0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000, + 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000, + 0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000, + 0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000, + 0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000, + 0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000, + 0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000, + 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000, + 0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000, + 0x38260000, 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000, + 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000, + 0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 
0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000, + 0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000, + 0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000, + 0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000, + 0x38320000, 0x38322000, 0x38324000, 0x38326000, 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000, + 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000, + 0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000, + 0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000, + 0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000, + 0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000, + 0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000, + 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000, + 0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000, + 0x38440000, 0x38442000, 0x38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000, + 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000, + 0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000, + 0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000, + 0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000, + 0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 
0x384FE000, + 0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000, + 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000, + 0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000, + 0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000, + 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, 0x3859E000, + 0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000, + 0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000, + 0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000, + 0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000, + 0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000, + 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000, + 0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000, + 0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000, + 0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000, + 0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000, + 0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000, + 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000, + 0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000, + 0x38740000, 0x38742000, 0x38744000, 0x38746000, 
0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000, + 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000, + 0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000, + 0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000, + 0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000, + 0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000 + }; + static const unsigned int exponent_table[64] = { + 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000, 0x07800000, + 0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000, + 0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, + 0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000 + }; + static const unsigned short offset_table[64] = { + 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, + 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 + }; + ConversionBits bits; + bits.i32 = mantissa_table[offset_table[value >> 10] + (value & 0x3FF)] + + exponent_table[value >> 10]; + return bits.f32; +} + +// ================================================================================================= + +// CLBLAST_HALF_H_ +#endif diff --git a/include/clblast_netlib_c.h b/include/clblast_netlib_c.h new file mode 100644 index 000000000..4c54fb188 --- /dev/null +++ b/include/clblast_netlib_c.h @@ -0,0 +1,993 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the Netlib CBLAS interface to the CLBlast BLAS routines, performing all buffer +// copies automatically and running on the default OpenCL platform and device. For full control over +// performance, it is advised to use the regular clblast.h or clblast_c.h headers instead. 
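The FloatToHalf/HalfToFloat helpers defined in clblast_half.h above can be exercised with a simple round trip; because the conversion truncates toward zero, the restored value may differ slightly from the original. A minimal sketch, assuming the header is on the include path:

// Round-trip a float through the 16-bit half representation (truncating conversion).
#include <clblast_half.h>
#include <cstdio>

int main() {
  const float original = 0.1f;
  const half h = FloatToHalf(original);    // 16-bit storage value (typedef'd unsigned short)
  const float restored = HalfToFloat(h);   // back to 32-bit float; some precision is lost
  std::printf("%.8f -> 0x%04X -> %.8f\n", original, (unsigned)h, restored);
  return 0;
}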
+// +// ================================================================================================= + +#ifndef CLBLAST_CLBLAST_NETLIB_C_H_ +#define CLBLAST_CLBLAST_NETLIB_C_H_ + +// Exports library functions under Windows when building a DLL. See also: +// https://msdn.microsoft.com/en-us/library/a90k134d.aspx +#if defined(_WIN32) && defined(CLBLAST_DLL) + #if defined(COMPILING_DLL) + #define PUBLIC_API __declspec(dllexport) + #else + #define PUBLIC_API __declspec(dllimport) + #endif +#else + #define PUBLIC_API +#endif + +// The C interface +#ifdef __cplusplus +extern "C" { +#endif + +// ================================================================================================= + +// Matrix layout and transpose types +typedef enum CLBlastLayout_ { CLBlastLayoutRowMajor = 101, + CLBlastLayoutColMajor = 102 } CLBlastLayout; +typedef enum CLBlastTranspose_ { CLBlastTransposeNo = 111, CLBlastTransposeYes = 112, + CLBlastTransposeConjugate = 113 } CLBlastTranspose; +typedef enum CLBlastTriangle_ { CLBlastTriangleUpper = 121, + CLBlastTriangleLower = 122 } CLBlastTriangle; +typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131, + CLBlastDiagonalUnit = 132 } CLBlastDiagonal; +typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide; +typedef enum CLBlastKernelMode_ { CLBlastKernelModeCrossCorrelation = 151, CLBlastKernelModeConvolution = 152 } CLBlastKernelMode; + +// For full compatibility with CBLAS +typedef CLBlastLayout CBLAS_ORDER; +typedef CLBlastTranspose CBLAS_TRANSPOSE; +typedef CLBlastTriangle CBLAS_UPLO; +typedef CLBlastDiagonal CBLAS_DIAG; +typedef CLBlastSide CBLAS_SIDE; +#define CblasRowMajor CLBlastLayoutRowMajor +#define CblasColMajor CLBlastLayoutColMajor +#define CblasNoTrans CLBlastTransposeNo +#define CblasTrans CLBlastTransposeYes +#define CblasConjTrans CLBlastTransposeConjugate +#define CblasUpper CLBlastTriangleUpper +#define CblasLower CLBlastTriangleLower +#define CblasNonUnit CLBlastDiagonalNonUnit +#define CblasUnit CLBlastDiagonalUnit +#define CblasLeft CLBlastSideLeft +#define CblasRight CLBlastSideRight + +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// Generate givens plane rotation: SROTG/DROTG +void PUBLIC_API cblas_srotg(float* sa, + float* sb, + float* sc, + float* ss); +void PUBLIC_API cblas_drotg(double* sa, + double* sb, + double* sc, + double* ss); + +// Generate modified givens plane rotation: SROTMG/DROTMG +void PUBLIC_API cblas_srotmg(float* sd1, + float* sd2, + float* sx1, + const float sy1, + float* sparam); +void PUBLIC_API cblas_drotmg(double* sd1, + double* sd2, + double* sx1, + const double sy1, + double* sparam); + +// Apply givens plane rotation: SROT/DROT +void PUBLIC_API cblas_srot(const int n, + float* x, const int x_inc, + float* y, const int y_inc, + const float cos, + const float sin); +void PUBLIC_API cblas_drot(const int n, + double* x, const int x_inc, + double* y, const int y_inc, + const double cos, + const double sin); + +// Apply modified givens plane rotation: SROTM/DROTM +void PUBLIC_API cblas_srotm(const int n, + float* x, const int x_inc, + float* y, const int y_inc, + float* sparam); +void PUBLIC_API cblas_drotm(const int n, + double* x, const int x_inc, + double* y, const int y_inc, + double* sparam); + +// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP +void PUBLIC_API
cblas_sswap(const int n, + float* x, const int x_inc, + float* y, const int y_inc); +void PUBLIC_API cblas_dswap(const int n, + double* x, const int x_inc, + double* y, const int y_inc); +void PUBLIC_API cblas_cswap(const int n, + void* x, const int x_inc, + void* y, const int y_inc); +void PUBLIC_API cblas_zswap(const int n, + void* x, const int x_inc, + void* y, const int y_inc); + +// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL +void PUBLIC_API cblas_sscal(const int n, + const float alpha, + float* x, const int x_inc); +void PUBLIC_API cblas_dscal(const int n, + const double alpha, + double* x, const int x_inc); +void PUBLIC_API cblas_cscal(const int n, + const void* alpha, + void* x, const int x_inc); +void PUBLIC_API cblas_zscal(const int n, + const void* alpha, + void* x, const int x_inc); + +// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY +void PUBLIC_API cblas_scopy(const int n, + const float* x, const int x_inc, + float* y, const int y_inc); +void PUBLIC_API cblas_dcopy(const int n, + const double* x, const int x_inc, + double* y, const int y_inc); +void PUBLIC_API cblas_ccopy(const int n, + const void* x, const int x_inc, + void* y, const int y_inc); +void PUBLIC_API cblas_zcopy(const int n, + const void* x, const int x_inc, + void* y, const int y_inc); + +// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY +void PUBLIC_API cblas_saxpy(const int n, + const float alpha, + const float* x, const int x_inc, + float* y, const int y_inc); +void PUBLIC_API cblas_daxpy(const int n, + const double alpha, + const double* x, const int x_inc, + double* y, const int y_inc); +void PUBLIC_API cblas_caxpy(const int n, + const void* alpha, + const void* x, const int x_inc, + void* y, const int y_inc); +void PUBLIC_API cblas_zaxpy(const int n, + const void* alpha, + const void* x, const int x_inc, + void* y, const int y_inc); + +// Dot product of two vectors: SDOT/DDOT/HDOT +float PUBLIC_API cblas_sdot(const int n, + const float* x, const int x_inc, + const float* y, const int y_inc); +double PUBLIC_API cblas_ddot(const int n, + const double* x, const int x_inc, + const double* y, const int y_inc); + +// Dot product of two complex vectors: CDOTU/ZDOTU +void PUBLIC_API cblas_cdotu_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot); +void PUBLIC_API cblas_zdotu_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot); + +// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC +void PUBLIC_API cblas_cdotc_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot); +void PUBLIC_API cblas_zdotc_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot); + +// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 +float PUBLIC_API cblas_snrm2(const int n, + const float* x, const int x_inc); +double PUBLIC_API cblas_dnrm2(const int n, + const double* x, const int x_inc); +float PUBLIC_API cblas_scnrm2(const int n, + const void* x, const int x_inc); +double PUBLIC_API cblas_dznrm2(const int n, + const void* x, const int x_inc); + +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM +float PUBLIC_API cblas_sasum(const int n, + const float* x, const int x_inc); +double PUBLIC_API cblas_dasum(const int n, + const double* x, const int x_inc); +float PUBLIC_API cblas_scasum(const int n, + const void* x, const int x_inc); +double PUBLIC_API cblas_dzasum(const int n, + const void* x, const int 
x_inc); + +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM +float PUBLIC_API cblas_ssum(const int n, + const float* x, const int x_inc); +double PUBLIC_API cblas_dsum(const int n, + const double* x, const int x_inc); +float PUBLIC_API cblas_scsum(const int n, + const void* x, const int x_inc); +double PUBLIC_API cblas_dzsum(const int n, + const void* x, const int x_inc); + +// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX +int PUBLIC_API cblas_isamax(const int n, + const float* x, const int x_inc); +int PUBLIC_API cblas_idamax(const int n, + const double* x, const int x_inc); +int PUBLIC_API cblas_icamax(const int n, + const void* x, const int x_inc); +int PUBLIC_API cblas_izamax(const int n, + const void* x, const int x_inc); + +// Index of absolute minimum value in a vector (non-BLAS function): iSAMIN/iDAMIN/iCAMIN/iZAMIN/iHAMIN +int PUBLIC_API cblas_isamin(const int n, + const float* x, const int x_inc); +int PUBLIC_API cblas_idamin(const int n, + const double* x, const int x_inc); +int PUBLIC_API cblas_icamin(const int n, + const void* x, const int x_inc); +int PUBLIC_API cblas_izamin(const int n, + const void* x, const int x_inc); + +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX +int PUBLIC_API cblas_ismax(const int n, + const float* x, const int x_inc); +int PUBLIC_API cblas_idmax(const int n, + const double* x, const int x_inc); +int PUBLIC_API cblas_icmax(const int n, + const void* x, const int x_inc); +int PUBLIC_API cblas_izmax(const int n, + const void* x, const int x_inc); + +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN +int PUBLIC_API cblas_ismin(const int n, + const float* x, const int x_inc); +int PUBLIC_API cblas_idmin(const int n, + const double* x, const int x_inc); +int PUBLIC_API cblas_icmin(const int n, + const void* x, const int x_inc); +int PUBLIC_API cblas_izmin(const int n, + const void* x, const int x_inc); + +// ================================================================================================= +// BLAS level-2 (matrix-vector) routines +// ================================================================================================= + +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV +void PUBLIC_API cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc); +void PUBLIC_API cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc); +void PUBLIC_API cblas_cgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); +void PUBLIC_API cblas_zgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); + +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV +void PUBLIC_API cblas_sgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, 
+ const int m, const int n, const int kl, const int ku, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc); +void PUBLIC_API cblas_dgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, const int kl, const int ku, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc); +void PUBLIC_API cblas_cgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, const int kl, const int ku, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); +void PUBLIC_API cblas_zgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, const int kl, const int ku, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); + +// Hermitian matrix-vector multiplication: CHEMV/ZHEMV +void PUBLIC_API cblas_chemv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); +void PUBLIC_API cblas_zhemv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); + +// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV +void PUBLIC_API cblas_chbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); +void PUBLIC_API cblas_zhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); + +// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV +void PUBLIC_API cblas_chpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* ap, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); +void PUBLIC_API cblas_zhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* ap, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); + +// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV +void PUBLIC_API cblas_ssymv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc); +void PUBLIC_API cblas_dsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc); + +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV +void PUBLIC_API cblas_ssbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + 
float* y, const int y_inc); +void PUBLIC_API cblas_dsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc); + +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV +void PUBLIC_API cblas_sspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const float* ap, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc); +void PUBLIC_API cblas_dspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const double* ap, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc); + +// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV +void PUBLIC_API cblas_strmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const float* a, const int a_ld, + float* x, const int x_inc); +void PUBLIC_API cblas_dtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const double* a, const int a_ld, + double* x, const int x_inc); +void PUBLIC_API cblas_ctrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* a, const int a_ld, + void* x, const int x_inc); +void PUBLIC_API cblas_ztrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* a, const int a_ld, + void* x, const int x_inc); + +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV +void PUBLIC_API cblas_stbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const float* a, const int a_ld, + float* x, const int x_inc); +void PUBLIC_API cblas_dtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const double* a, const int a_ld, + double* x, const int x_inc); +void PUBLIC_API cblas_ctbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const void* a, const int a_ld, + void* x, const int x_inc); +void PUBLIC_API cblas_ztbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const void* a, const int a_ld, + void* x, const int x_inc); + +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV +void PUBLIC_API cblas_stpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const float* ap, + float* x, const int x_inc); +void PUBLIC_API cblas_dtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const double* ap, + double* x, const int x_inc); +void PUBLIC_API cblas_ctpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const 
CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc); +void PUBLIC_API cblas_ztpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc); + +// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV +void PUBLIC_API cblas_strsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const float* a, const int a_ld, + float* x, const int x_inc); +void PUBLIC_API cblas_dtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const double* a, const int a_ld, + double* x, const int x_inc); +void PUBLIC_API cblas_ctrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* a, const int a_ld, + void* x, const int x_inc); +void PUBLIC_API cblas_ztrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* a, const int a_ld, + void* x, const int x_inc); + +// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV +void PUBLIC_API cblas_stbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const float* a, const int a_ld, + float* x, const int x_inc); +void PUBLIC_API cblas_dtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const double* a, const int a_ld, + double* x, const int x_inc); +void PUBLIC_API cblas_ctbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const void* a, const int a_ld, + void* x, const int x_inc); +void PUBLIC_API cblas_ztbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const void* a, const int a_ld, + void* x, const int x_inc); + +// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV +void PUBLIC_API cblas_stpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const float* ap, + float* x, const int x_inc); +void PUBLIC_API cblas_dtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const double* ap, + double* x, const int x_inc); +void PUBLIC_API cblas_ctpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc); +void PUBLIC_API cblas_ztpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc); + +// General rank-1 matrix update: SGER/DGER/HGER +void PUBLIC_API cblas_sger(const CLBlastLayout layout, + const int m, const int n, + const float alpha, + const 
float* x, const int x_inc, + const float* y, const int y_inc, + float* a, const int a_ld); +void PUBLIC_API cblas_dger(const CLBlastLayout layout, + const int m, const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + double* a, const int a_ld); + +// General rank-1 complex matrix update: CGERU/ZGERU +void PUBLIC_API cblas_cgeru(const CLBlastLayout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); +void PUBLIC_API cblas_zgeru(const CLBlastLayout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); + +// General rank-1 complex conjugated matrix update: CGERC/ZGERC +void PUBLIC_API cblas_cgerc(const CLBlastLayout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); +void PUBLIC_API cblas_zgerc(const CLBlastLayout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); + +// Hermitian rank-1 matrix update: CHER/ZHER +void PUBLIC_API cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const void* x, const int x_inc, + void* a, const int a_ld); +void PUBLIC_API cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const void* x, const int x_inc, + void* a, const int a_ld); + +// Hermitian packed rank-1 matrix update: CHPR/ZHPR +void PUBLIC_API cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const void* x, const int x_inc, + void* ap); +void PUBLIC_API cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const void* x, const int x_inc, + void* ap); + +// Hermitian rank-2 matrix update: CHER2/ZHER2 +void PUBLIC_API cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); +void PUBLIC_API cblas_zher2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); + +// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 +void PUBLIC_API cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* ap); +void PUBLIC_API cblas_zhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* ap); + +// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR +void PUBLIC_API cblas_ssyr(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + float* a, const int a_ld); +void PUBLIC_API cblas_dsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + double* a, const int a_ld); + +// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR +void PUBLIC_API cblas_sspr(const CLBlastLayout 
layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + float* ap); +void PUBLIC_API cblas_dspr(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + double* ap); + +// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 +void PUBLIC_API cblas_ssyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + const float* y, const int y_inc, + float* a, const int a_ld); +void PUBLIC_API cblas_dsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + double* a, const int a_ld); + +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 +void PUBLIC_API cblas_sspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + const float* y, const int y_inc, + float* ap); +void PUBLIC_API cblas_dspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + double* ap); + +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================= + +// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM +void PUBLIC_API cblas_sgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const int m, const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float* b, const int b_ld, + const float beta, + float* c, const int c_ld); +void PUBLIC_API cblas_dgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const int m, const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double* b, const int b_ld, + const double beta, + double* c, const int c_ld); +void PUBLIC_API cblas_cgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const int m, const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); +void PUBLIC_API cblas_zgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const int m, const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); + +// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM +void PUBLIC_API cblas_ssymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + const float* b, const int b_ld, + const float beta, + float* c, const int c_ld); +void PUBLIC_API cblas_dsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + const double* b, const int b_ld, + const double beta, + double* c, const int c_ld); +void PUBLIC_API cblas_csymm(const CLBlastLayout layout, const 
CLBlastSide side, const CLBlastTriangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); +void PUBLIC_API cblas_zsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); + +// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM +void PUBLIC_API cblas_chemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); +void PUBLIC_API cblas_zhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); + +// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK +void PUBLIC_API cblas_ssyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, + const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float beta, + float* c, const int c_ld); +void PUBLIC_API cblas_dsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, + const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double beta, + double* c, const int c_ld); +void PUBLIC_API cblas_csyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* beta, + void* c, const int c_ld); +void PUBLIC_API cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* beta, + void* c, const int c_ld); + +// Rank-K update of a hermitian matrix: CHERK/ZHERK +void PUBLIC_API cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, + const int n, const int k, + const float alpha, + const void* a, const int a_ld, + const float beta, + void* c, const int c_ld); +void PUBLIC_API cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, + const int n, const int k, + const double alpha, + const void* a, const int a_ld, + const double beta, + void* c, const int c_ld); + +// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K +void PUBLIC_API cblas_ssyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, + const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float* b, const int b_ld, + const float beta, + float* c, const int c_ld); +void PUBLIC_API cblas_dsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, + const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double* b, const int b_ld, + const double beta, + double* c, const int c_ld); +void PUBLIC_API cblas_csyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose 
ab_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); +void PUBLIC_API cblas_zsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); + +// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K +void PUBLIC_API cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const float beta, + void* c, const int c_ld); +void PUBLIC_API cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const double beta, + void* c, const int c_ld); + +// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM +void PUBLIC_API cblas_strmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + float* b, const int b_ld); +void PUBLIC_API cblas_dtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + double* b, const int b_ld); +void PUBLIC_API cblas_ctrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld); +void PUBLIC_API cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld); + +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM +void PUBLIC_API cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + float* b, const int b_ld); +void PUBLIC_API cblas_dtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + double* b, const int b_ld); +void PUBLIC_API cblas_ctrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld); +void PUBLIC_API cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const void* alpha, + const void* a, 
const int a_ld, + void* b, const int b_ld); + +// ================================================================================================= +// Extra non-BLAS routines (level-X) +// ================================================================================================= + +// Element-wise vector product (Hadamard): SHAD/DHAD/CHAD/ZHAD/HHAD +void PUBLIC_API cblas_shad(const int n, + const float alpha, + const float* x, const int x_inc, + const float* y, const int y_inc, + const float beta, + float* z, const int z_inc); +void PUBLIC_API cblas_dhad(const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + const double beta, + double* z, const int z_inc); +void PUBLIC_API cblas_chad(const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + const void* beta, + void* z, const int z_inc); +void PUBLIC_API cblas_zhad(const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + const void* beta, + void* z, const int z_inc); + +// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY +void PUBLIC_API cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + float* b, const int b_ld); +void PUBLIC_API cblas_domatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + double* b, const int b_ld); +void PUBLIC_API cblas_comatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld); +void PUBLIC_API cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld); + +// Im2col function (non-BLAS function): SIM2COL/DIM2COL/CIM2COL/ZIM2COL/HIM2COL +void PUBLIC_API cblas_sim2col(const CLBlastKernelMode kernel_mode, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, + const float* im, + float* col); +void PUBLIC_API cblas_dim2col(const CLBlastKernelMode kernel_mode, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, + const double* im, + double* col); +void PUBLIC_API cblas_cim2col(const CLBlastKernelMode kernel_mode, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, + const void* im, + void* col); +void PUBLIC_API cblas_zim2col(const CLBlastKernelMode kernel_mode, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, + const void* im, + void* col); + +// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM +void PUBLIC_API cblas_scol2im(const CLBlastKernelMode kernel_mode, + const int 
channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, + const float* col, + float* im); +void PUBLIC_API cblas_dcol2im(const CLBlastKernelMode kernel_mode, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, + const double* col, + double* im); +void PUBLIC_API cblas_ccol2im(const CLBlastKernelMode kernel_mode, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, + const void* col, + void* im); +void PUBLIC_API cblas_zcol2im(const CLBlastKernelMode kernel_mode, + const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, + const void* col, + void* im); + +// ================================================================================================= + +#ifdef __cplusplus +} // extern "C" +#endif + +// CLBLAST_CLBLAST_NETLIB_C_H_ +#endif diff --git a/koboldcpp.py b/koboldcpp.py index 329cb5b08..4f1144b9e 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -200,7 +200,7 @@ maxctx = 2048 maxlen = 128 modelbusy = False defaultport = 5001 -KcppVersion = "1.22" +KcppVersion = "1.23" class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): sys_version = "" diff --git a/llama.cpp b/llama.cpp index b4c8a8baa..4dd86ff2b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11,6 +11,8 @@ #include "ggml.h" #ifdef GGML_USE_CUBLAS #include "ggml-cuda.h" +#elif defined(GGML_USE_CLBLAST) +#include "ggml-opencl.h" #endif #include @@ -1041,7 +1043,7 @@ static void llama_model_load_internal( ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? 
&lctx.model.mlock_mmap : NULL); model.mapping = std::move(ml->mapping); -#ifdef GGML_USE_CUBLAS +#if defined(GGML_USE_CUBLAS) { const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); @@ -1067,6 +1069,32 @@ static void llama_model_load_internal( fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); } +#elif defined(GGML_USE_CLBLAST) + { + const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); + + fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu); + + size_t vram_total = 0; + + for (int i = 0; i < n_gpu; ++i) { + const auto & layer = model.layers[i]; + + ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq); + ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk); + ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv); + ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo); + ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1); + ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2); + ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3); + } + if (n_gpu_layers > (int) hparams.n_layer) { + fprintf(stderr, "%s: [opencl] offloading output layer to GPU\n", __func__); + ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output); + } + + fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); + } #else (void) n_gpu_layers; #endif
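
The table-driven conversion vendored in include/clblast_half.h earlier in this patch can be sanity-checked by hand. The sketch below is an illustration only, not part of the patch: it walks one lookup for the binary16 value 0x3C00 (which encodes 1.0f) and assumes the conventional table layout this method uses, in particular that mantissa_table[1024] is 0x38000000; the offset_table and exponent_table entries it reads are the ones visible in the tables above.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Walks HalfToFloat(v) = mantissa_table[offset_table[v >> 10] + (v & 0x3FF)]
 *                      + exponent_table[v >> 10]
 * for v = 0x3C00 (binary16 for 1.0), with the relevant table entries inlined. */
int main(void) {
    const uint16_t value    = 0x3C00;
    const uint32_t index    = 1024 + (value & 0x3FF); /* offset_table[15] = 1024, so index = 1024 */
    const uint32_t mantissa = 0x38000000u;            /* assumed: mantissa_table[1024] */
    const uint32_t exponent = 0x07800000u;            /* exponent_table[15], from the table above */
    const uint32_t bits     = mantissa + exponent;    /* 0x3F800000, the bit pattern of 1.0f */
    float result;
    memcpy(&result, &bits, sizeof result);            /* reinterpret the bits as an IEEE 754 float */
    printf("0x%04X -> index %u -> 0x%08X -> %f\n",
           (unsigned)value, (unsigned)index, (unsigned)bits, result);
    return 0;
}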
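
Since the new include/clblast_netlib_c.h header exposes CLBlast through plain Netlib CBLAS signatures, with buffer copies and device selection handled internally per its header comment, it can be exercised without touching the OpenCL API directly. The sketch below is an illustration only, not part of the patch: it assumes the program is linked against a CLBlast build that has the optional Netlib interface compiled in, and uses the cblas_sgemm signature declared above.

#include <stdio.h>
#include "clblast_netlib_c.h"

/* C = alpha*A*B + beta*C for two row-major 2x2 matrices, using the
 * CBLAS-style entry point declared in clblast_netlib_c.h. The wrapper
 * copies the host buffers to the default OpenCL device and back. */
int main(void) {
    const int m = 2, n = 2, k = 2;
    float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float b[4] = {5.0f, 6.0f, 7.0f, 8.0f};
    float c[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    cblas_sgemm(CLBlastLayoutRowMajor, CLBlastTransposeNo, CLBlastTransposeNo,
                m, n, k,
                1.0f,   /* alpha */
                a, k,   /* A, leading dimension k */
                b, n,   /* B, leading dimension n */
                0.0f,   /* beta  */
                c, n);  /* C, leading dimension n */
    /* Expected result: [19 22; 43 50] */
    printf("%.0f %.0f\n%.0f %.0f\n", c[0], c[1], c[2], c[3]);
    return 0;
}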