From c3db99ea323889d6f4bfb7004e2a4704e615183a Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Mon, 10 Apr 2023 09:49:40 +0200 Subject: [PATCH] Allow use of OpenCL GPU-based BLAS using ClBlast instead of OpenBLAS for context processing --- Makefile | 4 ++ ggml.c | 137 +++++++++++++++++++++++++++++++++++++++++++--- llama_adapter.cpp | 2 + 3 files changed, 134 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 5599f0258..5661d8bb2 100644 --- a/Makefile +++ b/Makefile @@ -96,6 +96,10 @@ ifdef LLAMA_OPENBLAS CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas LDFLAGS += -lopenblas endif +ifdef LLAMA_CLBLAST + CFLAGS += -DGGML_USE_CLBLAST + LDFLAGS += -lclblast -lOpenCL +endif ifdef LLAMA_GPROF CFLAGS += -pg CXXFLAGS += -pg diff --git a/ggml.c b/ggml.c index 326b8e842..cef76053a 100644 --- a/ggml.c +++ b/ggml.c @@ -128,9 +128,25 @@ typedef void* thread_ret_t; } \ } while (0) +#if GGML_USE_CLBLAST +#ifndef GGML_USE_OPENBLAS +#define GGML_USE_OPENBLAS +#endif + +#define CL_TARGET_OPENCL_VERSION 110 +#include + +cl_platform_id platform; +cl_device_id device; +cl_context context; +cl_command_queue queue; +cl_event event; +bool cl_initialized = false; +#endif + #ifdef GGML_USE_ACCELERATE #include -#elif GGML_USE_OPENBLAS +#elif defined(GGML_USE_OPENBLAS) #include #endif @@ -6104,7 +6120,7 @@ static void ggml_compute_forward_rms_norm( // ggml_compute_forward_mul_mat -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) // helper function to determine if it is better to use BLAS or not // for large matrices, BLAS is faster static bool ggml_compute_forward_mul_mat_use_blas( @@ -6129,6 +6145,85 @@ static bool ggml_compute_forward_mul_mat_use_blas( return false; } + +#ifdef GGML_USE_CLBLAST +static bool ggml_cl_sgemm_wrapper(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans_a, const enum CBLAS_TRANSPOSE trans_b, const int m, const int n, const int k, const float alpha, const float *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc) { + cl_int err = 0; + + if (!cl_initialized) { + cl_uint num_platforms; + clGetPlatformIDs(0, NULL, &num_platforms); + cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id)); + clGetPlatformIDs(num_platforms, platforms, NULL); + platform = platforms[0]; + cl_uint num_devices; + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); + cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); + device = devices[0]; + context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); + if (err != CL_SUCCESS) { + printf("Error creating OpenCL context: %d\n", err); + fflush(stdout); + } + queue = clCreateCommandQueue(context, device, 0, &err); + event = NULL; + + if (err != CL_SUCCESS) { + printf("Error creating OpenCL Command Queue: %d\n", err); + fflush(stdout); + } + + free(platforms); + free(devices); + cl_initialized = true; + } + + // Prepare buffers + cl_mem cl_buffer_a = clCreateBuffer(context, CL_MEM_READ_WRITE, m*k*sizeof(float), NULL, &err); + if (err != CL_SUCCESS) { + printf("Error creating OpenCL Buffer A: %d\n", err); + fflush(stdout); + } + cl_mem cl_buffer_b = clCreateBuffer(context, CL_MEM_READ_WRITE, n*k*sizeof(float), NULL, &err); + if (err != CL_SUCCESS) { + printf("Error creating OpenCL Buffer B: %d\n", err); + fflush(stdout); + } + cl_mem cl_buffer_c = clCreateBuffer(context, CL_MEM_READ_WRITE, m*n*sizeof(float), NULL, &err); + if (err != CL_SUCCESS) { + printf("Error creating OpenCL Buffer C: %d\n", err); + fflush(stdout); + } + + clEnqueueWriteBuffer(queue, cl_buffer_a, CL_TRUE, 0, m*k*sizeof(float), host_a, 0, NULL, NULL); + clEnqueueWriteBuffer(queue, cl_buffer_b, CL_TRUE, 0, n*k*sizeof(float), host_b, 0, NULL, NULL); + clEnqueueWriteBuffer(queue, cl_buffer_c, CL_TRUE, 0, m*n*sizeof(float), host_c, 0, NULL, NULL); + + // Call the SGEMM routine. + CLBlastStatusCode status = CLBlastSgemm(order, + trans_a, trans_b, + m, n, k, + alpha, + cl_buffer_a, 0, lda, + cl_buffer_b, 0, ldb, + beta, + cl_buffer_c, 0, ldc, + &queue, &event); + + // Wait for completion + if (status == CLBlastSuccess) { + clWaitForEvents(1, &event); + clReleaseEvent(event); + } + + clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, m*n*sizeof(float), host_c, 0, NULL, NULL); + + clReleaseMemObject(cl_buffer_a); + clReleaseMemObject(cl_buffer_b); + clReleaseMemObject(cl_buffer_c); +} +#endif #endif static void ggml_compute_forward_mul_mat_f32( @@ -6144,7 +6239,7 @@ static void ggml_compute_forward_mul_mat_f32( const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) const int64_t ne10 = src1->ne[0]; #endif const int64_t ne11 = src1->ne[1]; @@ -6201,7 +6296,7 @@ static void ggml_compute_forward_mul_mat_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { if (params->ith != 0) { return; @@ -6223,11 +6318,19 @@ static void ggml_compute_forward_mul_mat_f32( float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); // zT = y * xT +#ifdef GGML_USE_CLBLAST + ggml_cl_sgemm_wrapper(CblasRowMajor, CblasNoTrans, CblasTrans, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne10, + 0.0f, d, ne01); +#else cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, x, ne10, 0.0f, d, ne01); +#endif } } @@ -6360,7 +6463,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { GGML_ASSERT(nb10 == sizeof(float)); @@ -6395,11 +6498,19 @@ static void ggml_compute_forward_mul_mat_f16_f32( float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); // zT = y * xT +#ifdef GGML_USE_CLBLAST + ggml_cl_sgemm_wrapper(CblasRowMajor, CblasNoTrans, CblasTrans, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne10, + 0.0f, d, ne01); +#else cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, x, ne10, 0.0f, d, ne01); +#endif } } @@ -6575,7 +6686,7 @@ static void ggml_compute_forward_mul_mat_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { if (params->ith != 0) { return; @@ -6608,11 +6719,19 @@ static void ggml_compute_forward_mul_mat_q_f32( float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); // zT = y * xT +#ifdef GGML_USE_CLBLAST + ggml_cl_sgemm_wrapper(CblasRowMajor, CblasNoTrans, CblasTrans, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne10, + 0.0f, d, ne01); +#else cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, x, ne10, 0.0f, d, ne01); +#endif } } @@ -9325,7 +9444,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) size_t cur = 0; if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; // TODO: this actually is doing nothing // the threads are still spinning @@ -9342,7 +9461,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) { cur = 0; } else if (quantize_fns[node->src0->type].vec_dot_q && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); @@ -10638,7 +10757,7 @@ int ggml_cpu_has_wasm_simd(void) { } int ggml_cpu_has_blas(void) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) return 1; #else return 0; diff --git a/llama_adapter.cpp b/llama_adapter.cpp index 647eb91a6..ae873efa6 100644 --- a/llama_adapter.cpp +++ b/llama_adapter.cpp @@ -251,10 +251,12 @@ generation_outputs llama_generate(const generation_inputs inputs, generation_out last_n_tokens.push_back(embd_inp[input_consumed]); current_context_tokens.push_back(embd_inp[input_consumed]); ++input_consumed; +#ifndef GGML_USE_CLBLAST if ((int)embd.size() >= params.n_batch) { break; } +#endif } } }