From c3db99ea323889d6f4bfb7004e2a4704e615183a Mon Sep 17 00:00:00 2001
From: 0cc4m <picard12@live.de>
Date: Mon, 10 Apr 2023 09:49:40 +0200
Subject: [PATCH] Allow use of OpenCL GPU-based BLAS using ClBlast instead of
 OpenBLAS for context processing

---
 Makefile          |   4 ++
 ggml.c            | 137 +++++++++++++++++++++++++++++++++++++++++++---
 llama_adapter.cpp |   2 +
 3 files changed, 134 insertions(+), 9 deletions(-)

diff --git a/Makefile b/Makefile
index 5599f0258..5661d8bb2 100644
--- a/Makefile
+++ b/Makefile
@@ -96,6 +96,12 @@ ifdef LLAMA_OPENBLAS
 	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
 	LDFLAGS += -lopenblas
 endif
+ifdef LLAMA_CLBLAST
+# llama_adapter.cpp (C++) also tests GGML_USE_CLBLAST, so define it for both compilers
+	CFLAGS   += -DGGML_USE_CLBLAST
+	CXXFLAGS += -DGGML_USE_CLBLAST
+	LDFLAGS  += -lclblast -lOpenCL
+endif
 ifdef LLAMA_GPROF
 	CFLAGS   += -pg
 	CXXFLAGS += -pg
diff --git a/ggml.c b/ggml.c
index 326b8e842..cef76053a 100644
--- a/ggml.c
+++ b/ggml.c
@@ -128,9 +128,29 @@ typedef void* thread_ret_t;
         } \
     } while (0)
 
+#ifdef GGML_USE_CLBLAST
+// CLBlast only replaces the sgemm call: the CBLAS enums and the BLAS code paths
+// guarded by GGML_USE_OPENBLAS are reused, so make sure that macro is defined.
+#ifndef GGML_USE_OPENBLAS
+#define GGML_USE_OPENBLAS
+#endif
+
+#define CL_TARGET_OPENCL_VERSION 110
+#include <clblast_c.h>
+
+// OpenCL state shared by the CLBlast calls in this file; it is set up on first use.
+// File-local so that these generic names are not exported from ggml.c.
+static cl_platform_id platform;
+static cl_device_id device;
+static cl_context context;
+static cl_command_queue queue;
+static cl_event event;
+static bool cl_initialized = false;
+#endif
+
 #ifdef GGML_USE_ACCELERATE
 #include <Accelerate/Accelerate.h>
-#elif GGML_USE_OPENBLAS
+#elif defined(GGML_USE_OPENBLAS)
 #include <cblas.h>
 #endif
 
@@ -6104,7 +6120,7 @@ static void ggml_compute_forward_rms_norm(
 
 // ggml_compute_forward_mul_mat
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
 // helper function to determine if it is better to use BLAS or not
 // for large matrices, BLAS is faster
 static bool ggml_compute_forward_mul_mat_use_blas(
@@ -6129,6 +6145,147 @@ static bool ggml_compute_forward_mul_mat_use_blas(
 
     return false;
 }
+
+#ifdef GGML_USE_CLBLAST
+// Lazily set up the shared OpenCL state (platform, device, context, queue),
+// using the first device of the first platform the OpenCL runtime reports.
+// Returns true once the state is usable. On failure an error is printed to
+// stderr and nothing is marked as initialized, so a later call tries again.
+static bool ggml_cl_init(void) {
+    if (cl_initialized) {
+        return true;
+    }
+
+    cl_int  err           = CL_SUCCESS;
+    cl_uint num_platforms = 0;
+    cl_uint num_devices   = 0;
+
+    // only the first entry is ever used, so there is no need to fetch the full lists
+    err = clGetPlatformIDs(1, &platform, &num_platforms);
+    if (err != CL_SUCCESS || num_platforms == 0) {
+        fprintf(stderr, "Error: no OpenCL platform found: %d\n", err);
+        return false;
+    }
+
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, &num_devices);
+    if (err != CL_SUCCESS || num_devices == 0) {
+        fprintf(stderr, "Error: no OpenCL device found: %d\n", err);
+        return false;
+    }
+
+    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "Error creating OpenCL context: %d\n", err);
+        return false;
+    }
+
+    queue = clCreateCommandQueue(context, device, 0, &err);
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "Error creating OpenCL Command Queue: %d\n", err);
+        clReleaseContext(context);
+        return false;
+    }
+
+    event = NULL;
+    cl_initialized = true;
+    return true;
+}
+
+// GEMM on the OpenCL device via CLBlast: C = alpha*op(A)*op(B) + beta*C.
+// The arguments mirror cblas_sgemm; host_a, host_b and host_c are host pointers.
+// m*k floats of A, n*k floats of B and m*n floats of C are copied into device
+// buffers created for this call, and C is read back into host_c afterwards.
+// Returns true on success. On any OpenCL/CLBlast failure an error is printed to
+// stderr, false is returned and the contents of host_c must not be relied upon.
+static bool ggml_cl_sgemm_wrapper(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans_a, const enum CBLAS_TRANSPOSE trans_b, const int m, const int n, const int k, const float alpha, const float *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc) {
+    if (!ggml_cl_init()) {
+        return false;
+    }
+
+    // do the size arithmetic in size_t so that m*k etc. cannot overflow int
+    const size_t size_a = (size_t) m * (size_t) k * sizeof(float);
+    const size_t size_b = (size_t) n * (size_t) k * sizeof(float);
+    const size_t size_c = (size_t) m * (size_t) n * sizeof(float);
+
+    bool   ok  = false;
+    cl_int err = CL_SUCCESS;
+    cl_mem cl_buffer_a = NULL;
+    cl_mem cl_buffer_b = NULL;
+    cl_mem cl_buffer_c = NULL;
+    CLBlastStatusCode status = CLBlastSuccess;
+
+    // Prepare buffers
+    cl_buffer_a = clCreateBuffer(context, CL_MEM_READ_WRITE, size_a, NULL, &err);
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "Error creating OpenCL Buffer A: %d\n", err);
+        goto cleanup;
+    }
+    cl_buffer_b = clCreateBuffer(context, CL_MEM_READ_WRITE, size_b, NULL, &err);
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "Error creating OpenCL Buffer B: %d\n", err);
+        goto cleanup;
+    }
+    cl_buffer_c = clCreateBuffer(context, CL_MEM_READ_WRITE, size_c, NULL, &err);
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "Error creating OpenCL Buffer C: %d\n", err);
+        goto cleanup;
+    }
+
+    // blocking uploads; C is uploaded too because it is an input whenever beta != 0
+    err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_TRUE, 0, size_a, host_a, 0, NULL, NULL);
+    if (err == CL_SUCCESS) {
+        err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_TRUE, 0, size_b, host_b, 0, NULL, NULL);
+    }
+    if (err == CL_SUCCESS) {
+        err = clEnqueueWriteBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 0, NULL, NULL);
+    }
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "Error uploading matrices to OpenCL device: %d\n", err);
+        goto cleanup;
+    }
+
+    // Call the SGEMM routine.
+    // The CBLAS layout/transpose arguments are handed to CLBlast unchanged.
+    status = CLBlastSgemm(order,
+                          trans_a, trans_b,
+                          m, n, k,
+                          alpha,
+                          cl_buffer_a, 0, lda,
+                          cl_buffer_b, 0, ldb,
+                          beta,
+                          cl_buffer_c, 0, ldc,
+                          &queue, &event);
+    if (status != CLBlastSuccess) {
+        fprintf(stderr, "Error running CLBlastSgemm: %d\n", (int) status);
+        goto cleanup;
+    }
+
+    // Wait for completion
+    clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+    event = NULL;
+
+    err = clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 0, NULL, NULL);
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "Error reading result from OpenCL device: %d\n", err);
+        goto cleanup;
+    }
+
+    ok = true;
+
+cleanup:
+    if (cl_buffer_a != NULL) {
+        clReleaseMemObject(cl_buffer_a);
+    }
+    if (cl_buffer_b != NULL) {
+        clReleaseMemObject(cl_buffer_b);
+    }
+    if (cl_buffer_c != NULL) {
+        clReleaseMemObject(cl_buffer_c);
+    }
+    return ok;
+}
+#endif
 #endif
 
 static void ggml_compute_forward_mul_mat_f32(
@@ -6144,7 +6239,7 @@ static void ggml_compute_forward_mul_mat_f32(
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
     const int64_t ne10 = src1->ne[0];
 #endif
     const int64_t ne11 = src1->ne[1];
@@ -6201,7 +6296,7 @@ static void ggml_compute_forward_mul_mat_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         if (params->ith != 0) {
             return;
@@ -6223,11 +6318,19 @@ static void ggml_compute_forward_mul_mat_f32(
                 float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
                 // zT = y * xT
+#ifdef GGML_USE_CLBLAST
+                ggml_cl_sgemm_wrapper(CblasRowMajor, CblasNoTrans, CblasTrans,
+                        ne11, ne01, ne10,
+                        1.0f,    y, ne10,
+                                 x, ne10,
+                        0.0f,    d, ne01);
+#else
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                         ne11, ne01, ne10,
                         1.0f,    y, ne10,
                                  x, ne10,
                         0.0f,    d, ne01);
+#endif
             }
         }
 
@@ -6360,7 +6463,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         GGML_ASSERT(nb10 == sizeof(float));
 
@@ -6395,11 +6498,19 @@ static void ggml_compute_forward_mul_mat_f16_f32(
                 float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
                 // zT = y * xT
+#ifdef GGML_USE_CLBLAST
+                ggml_cl_sgemm_wrapper(CblasRowMajor, CblasNoTrans, CblasTrans,
+                        ne11, ne01, ne10,
+                        1.0f,    y, ne10,
+                                 x, ne10,
+                        0.0f,    d, ne01);
+#else
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                         ne11, ne01, ne10,
                         1.0f,    y, ne10,
                                  x, ne10,
                         0.0f,    d, ne01);
+#endif
             }
         }
 
@@ -6575,7 +6686,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         if (params->ith != 0) {
             return;
@@ -6608,11 +6719,19 @@ static void ggml_compute_forward_mul_mat_q_f32(
                 float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
                 // zT = y * xT
+#ifdef GGML_USE_CLBLAST
+                ggml_cl_sgemm_wrapper(CblasRowMajor, CblasNoTrans, CblasTrans,
+                        ne11, ne01, ne10,
+                        1.0f,    y, ne10,
+                                 x, ne10,
+                        0.0f,    d, ne01);
+#else
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                         ne11, ne01, ne10,
                         1.0f,    y, ne10,
                                  x, ne10,
                         0.0f,    d, ne01);
+#endif
             }
         }
 
@@ -9325,7 +9444,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                         size_t cur = 0;
 
                         if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1; // TODO: this actually is doing nothing
                                                    //       the threads are still spinning
@@ -9342,7 +9461,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                         } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
                             cur = 0;
                         } else if (quantize_fns[node->src0->type].vec_dot_q && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1;
                                 cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
@@ -10638,7 +10757,7 @@ int ggml_cpu_has_wasm_simd(void) {
 }
 
 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
     return 1;
 #else
     return 0;
diff --git a/llama_adapter.cpp b/llama_adapter.cpp
index 647eb91a6..ae873efa6 100644
--- a/llama_adapter.cpp
+++ b/llama_adapter.cpp
@@ -251,10 +251,12 @@ generation_outputs llama_generate(const generation_inputs inputs, generation_out
                 last_n_tokens.push_back(embd_inp[input_consumed]);
                 current_context_tokens.push_back(embd_inp[input_consumed]);
                 ++input_consumed;
+#ifndef GGML_USE_CLBLAST
                 if ((int)embd.size() >= params.n_batch)
                 {
                     break;
                 }
+#endif
             }
         }
     }