From 1747c598fab95d95580a70fe0327348c1555d755 Mon Sep 17 00:00:00 2001
From: 0cc4m <picard12@live.de>
Date: Mon, 15 May 2023 19:51:23 +0200
Subject: [PATCH 1/2] Fix CMakeLists.txt

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 48e3238df..62e06f218 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -207,7 +207,7 @@ if (LLAMA_CLBLAST)
     if (CLBlast_FOUND)
         message(STATUS "CLBlast found")
 
-        set(GGML_OPENCL_SOURCES ggml-opencl.c ggml-opencl.h)
+        set(GGML_OPENCL_SOURCES ggml-opencl.cpp ggml-opencl.h)
 
         add_compile_definitions(GGML_USE_CLBLAST)
 

From 342d346c13ffc8ac4227c32451d4731ebd91dbfd Mon Sep 17 00:00:00 2001
From: 0cc4m <picard12@live.de>
Date: Tue, 16 May 2023 07:42:01 +0200
Subject: [PATCH 2/2] Generate dequant_mul_mat kernels from simple templates

---
 ggml-opencl.cpp | 263 ++++++++++--------------------------------------
 1 file changed, 52 insertions(+), 211 deletions(-)

diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
index c842977b5..e99b85068 100644
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@@ -1,6 +1,8 @@
 #include "ggml-opencl.h"
 
+#include <array>
 #include <atomic>
+#include <sstream>
 
 #define CL_TARGET_OPENCL_VERSION 110
 #include <clblast.h>
@@ -14,7 +16,7 @@
 #define CL_DMMV_BLOCK_SIZE 32;
 
 #define MULTILINE_QUOTE(...) #__VA_ARGS__
-const char * clblast_dequant = MULTILINE_QUOTE(
+std::string program_source = MULTILINE_QUOTE(
 
 typedef char int8_t;
 typedef uchar uint8_t;
@@ -172,47 +174,6 @@ void dequantize_q4_0(__global const struct block_q4_0* x, const int ib, const in
     *v0 = (vi0 - 8)*d;
     *v1 = (vi1 - 8)*d;
 }
-
-__kernel void dequantize_mul_mat_vec_q4_0(__global struct block_q4_0* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
-    const int block_size = get_local_size(0);
-    const int row = get_global_id(0) / block_size;
-    const int tid = get_local_id(0);
-
-    const uint qk = QK4_0;
-    const uint qr = QR4_0;
-
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    tmp[tid] = 0;
-
-    for (int i = 0; i < ncols/block_size; i += 2) {
-        const int col = i*block_size + 2*tid;
-        const int ib = (row*ncols + col)/qk; // block index
-        const int iqs = (col%qk)/qr; // quant index
-        const int iybs = col - col%qk; // y block start index
-
-        // dequantize
-        float v0, v1;
-        dequantize_q4_0(x, ib, iqs, &v0, &v1);
-
-        // matrix multiplication
-        tmp[tid] += v0 * y[iybs + iqs + 0];
-        tmp[tid] += v1 * y[iybs + iqs + y_offset];
-    }
-
-    // sum up partial sums and write back result
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=block_size/2; s>0; s>>=1) {
-        if (tid < s) {
-            tmp[tid] += tmp[tid + s];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (tid == 0) {
-        dst[row] = tmp[0];
-    }
-}
-
 void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const int iqs, float* v0, float* v1) {
     const float d = x[ib].d;
     const float m = x[ib].m;
@@ -225,46 +186,6 @@ void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const in
     *v0 = vi0*d + m;
     *v1 = vi1*d + m;
 }
-__kernel void dequantize_mul_mat_vec_q4_1(__global struct block_q4_1* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
-    const int block_size = get_local_size(0);
-    const int row = get_global_id(0) / block_size;
-    const int tid = get_local_id(0);
-
-    const uint qk = QK4_1;
-    const uint qr = QR4_1;
-
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    tmp[tid] = 0;
-
-    for (int i = 0; i < ncols/block_size; i += 2) {
-        const int col = i*block_size + 2*tid;
-        const int ib = (row*ncols + col)/qk; // block index
-        const int iqs = (col%qk)/qr; // quant index
-        const int iybs = col - col%qk; // y block start index
-
-        // dequantize
-        float v0, v1;
-        dequantize_q4_1(x, ib, iqs, &v0, &v1);
-
-        // matrix multiplication
-        tmp[tid] += v0 * y[iybs + iqs + 0];
-        tmp[tid] += v1 * y[iybs + iqs + y_offset];
-    }
-
-    // sum up partial sums and write back result
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=block_size/2; s>0; s>>=1) {
-        if (tid < s) {
-            tmp[tid] += tmp[tid + s];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (tid == 0) {
-        dst[row] = tmp[0];
-    }
-}
-
 void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const int iqs, float* v0, float* v1) {
     const float d = vload_half(0, (__global half*) &x[ib].d);
 
@@ -279,46 +200,6 @@ void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const in
     *v0 = x0*d;
     *v1 = x1*d;
 }
-__kernel void dequantize_mul_mat_vec_q5_0(__global struct block_q5_0* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
-    const int block_size = get_local_size(0);
-    const int row = get_global_id(0) / block_size;
-    const int tid = get_local_id(0);
-
-    const uint qk = QK5_0;
-    const uint qr = QR5_0;
-
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    tmp[tid] = 0;
-
-    for (int i = 0; i < ncols/block_size; i += 2) {
-        const int col = i*block_size + 2*tid;
-        const int ib = (row*ncols + col)/qk; // block index
-        const int iqs = (col%qk)/qr; // quant index
-        const int iybs = col - col%qk; // y block start index
-
-        // dequantize
-        float v0, v1;
-        dequantize_q5_0(x, ib, iqs, &v0, &v1);
-
-        // matrix multiplication
-        tmp[tid] += v0 * y[iybs + iqs + 0];
-        tmp[tid] += v1 * y[iybs + iqs + y_offset];
-    }
-
-    // sum up partial sums and write back result
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=block_size/2; s>0; s>>=1) {
-        if (tid < s) {
-            tmp[tid] += tmp[tid + s];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (tid == 0) {
-        dst[row] = tmp[0];
-    }
-}
-
 void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const int iqs, float* v0, float* v1) {
     const float d = vload_half(0, (__global half*) &x[ib].d);
     const float m = vload_half(0, (__global half*) &x[ib].m);
@@ -334,46 +215,6 @@ void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const in
     *v0 = x0*d + m;
     *v1 = x1*d + m;
 }
-__kernel void dequantize_mul_mat_vec_q5_1(__global struct block_q5_1* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
-    const int block_size = get_local_size(0);
-    const int row = get_global_id(0) / block_size;
-    const int tid = get_local_id(0);
-
-    const uint qk = QK5_1;
-    const uint qr = QR5_1;
-
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    tmp[tid] = 0;
-
-    for (int i = 0; i < ncols/block_size; i += 2) {
-        const int col = i*block_size + 2*tid;
-        const int ib = (row*ncols + col)/qk; // block index
-        const int iqs = (col%qk)/qr; // quant index
-        const int iybs = col - col%qk; // y block start index
-
-        // dequantize
-        float v0, v1;
-        dequantize_q5_1(x, ib, iqs, &v0, &v1);
-
-        // matrix multiplication
-        tmp[tid] += v0 * y[iybs + iqs + 0];
-        tmp[tid] += v1 * y[iybs + iqs + y_offset];
-    }
-
-    // sum up partial sums and write back result
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=block_size/2; s>0; s>>=1) {
-        if (tid < s) {
-            tmp[tid] += tmp[tid + s];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (tid == 0) {
-        dst[row] = tmp[0];
-    }
-}
-
 void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const int iqs, float* v0, float* v1) {
     const float d = x[ib].d;
 
@@ -383,13 +224,20 @@ void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const in
     *v0 = vi0*d;
     *v1 = vi1*d;
 }
-__kernel void dequantize_mul_mat_vec_q8_0(__global struct block_q8_0* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
+void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){
+    *v0 = vload_half(0, (__global half*) &x[ib + 0]);
+    *v1 = vload_half(0, (__global half*) &x[ib + 1]);
+}
+);
+
+std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
+__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
     const int block_size = get_local_size(0);
     const int row = get_global_id(0) / block_size;
     const int tid = get_local_id(0);
 
-    const uint qk = QK8_0;
-    const uint qr = QR8_0;
+    const uint qk = QUANT_K;
+    const uint qr = QUANT_R;
 
     const int y_offset = qr == 1 ? 1 : qk/2;
 
@@ -403,51 +251,7 @@ __kernel void dequantize_mul_mat_vec_q8_0(__global struct block_q8_0* x, __local
 
         // dequantize
         float v0, v1;
-        dequantize_q8_0(x, ib, iqs, &v0, &v1);
-
-        // matrix multiplication
-        tmp[tid] += v0 * y[iybs + iqs + 0];
-        tmp[tid] += v1 * y[iybs + iqs + y_offset];
-    }
-
-    // sum up partial sums and write back result
-    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=block_size/2; s>0; s>>=1) {
-        if (tid < s) {
-            tmp[tid] += tmp[tid + s];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-    if (tid == 0) {
-        dst[row] = tmp[0];
-    }
-}
-
-void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){
-    *v0 = vload_half(0, (__global half*) &x[ib + 0]);
-    *v1 = vload_half(0, (__global half*) &x[ib + 1]);
-}
-__kernel void convert_mul_mat_vec_f16(__global half* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
-    const int block_size = get_local_size(0);
-    const int row = get_global_id(0) / block_size;
-    const int tid = get_local_id(0);
-
-    const uint qk = 32;
-    const uint qr = 1;
-
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    tmp[tid] = 0;
-
-    for (int i = 0; i < ncols/block_size; i += 2) {
-        const int col = i*block_size + 2*tid;
-        const int ib = (row*ncols + col)/qk; // block index
-        const int iqs = (col%qk)/qr; // quant index
-        const int iybs = col - col%qk; // y block start index
-
-        // convert
-        float v0, v1;
-        convert_f16(x, ib, iqs, &v0, &v1);
+        DEQUANT_FUNC(x, ib, iqs, &v0, &v1);
 
         // matrix multiplication
         tmp[tid] += v0 * y[iybs + iqs + 0];
@@ -468,6 +272,41 @@ __kernel void convert_mul_mat_vec_f16(__global half* x, __local float* tmp, __gl
 }
 );
 
+std::array<std::string, 5> dequant_mul_mat_vec_str_keys = {
+    "KERNEL_NAME", "X_TYPE", "QUANT_K", "QUANT_R", "DEQUANT_FUNC"
+};
+
+std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
+    "dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0",
+    "dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1",
+    "dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0",
+    "dequantize_mul_mat_vec_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1",
+    "dequantize_mul_mat_vec_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0",
+    "convert_mul_mat_vec_f16", "half", "32", "1", "convert_f16"
+};
+
+std::string& replace(std::string& s, const std::string& from, const std::string& to) {
+    size_t pos = 0;
+    while ((pos = s.find(from, pos)) != std::string::npos) {
+         s.replace(pos, from.length(), to);
+         pos += to.length();
+    }
+    return s;
+}
+
+std::string generate_kernels() {
+    std::stringstream src;
+    src << program_source << '\n';
+    for (size_t i = 0; i < dequant_mul_mat_vec_str_values.size(); i += dequant_mul_mat_vec_str_keys.size()) {
+        std::string kernel = dequant_mul_mat_vec_template;
+        for (size_t j = 0; j < dequant_mul_mat_vec_str_keys.size(); j++) {
+            replace(kernel, dequant_mul_mat_vec_str_keys[j], dequant_mul_mat_vec_str_values[i + j]);
+        }
+        src << kernel << '\n';
+    }
+    return src.str();
+}
+
 #define CL_CHECK(err, name)                                                                     \
     do {                                                                                        \
         cl_int err_ = (err);                                                                    \
@@ -559,7 +398,9 @@ void ggml_cl_init(void) {
     free(platforms);
     free(devices);
 
-    program = build_program_from_source(context, device, clblast_dequant);
+    std::string kernel_src = generate_kernels();
+
+    program = build_program_from_source(context, device, kernel_src.c_str());
 
     // FP16 to FP32 kernel
     convert_fp16_to_fp32_cl = clCreateKernel(program, "convert_fp16_to_fp32", &err);