From 2b0c6a56f9ea7c690520ade1897bffd09dfb33de Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 26 Apr 2023 07:48:04 +0200 Subject: [PATCH] Improve code quality * Move internal stuff out of header * Use internal enums instead of CLBlast enums * Remove leftover C++ includes and defines * Make event use easier to read Co-authored-by: Henri Vasserman --- CMakeLists.txt | 2 +- Makefile | 2 +- ggml-opencl.cpp => ggml-opencl.c | 66 ++++++++++++++++++++------------ ggml-opencl.h | 31 ++++++--------- ggml.c | 6 +-- 5 files changed, 59 insertions(+), 48 deletions(-) rename ggml-opencl.cpp => ggml-opencl.c (75%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 79dfd070b..5fdbeddfc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -174,7 +174,7 @@ if (LLAMA_CLBLAST) if (CLBlast_FOUND) message(STATUS "CLBlast found") - set(GGML_OPENCL_SOURCES ggml-opencl.cpp ggml-opencl.h) + set(GGML_OPENCL_SOURCES ggml-opencl.c ggml-opencl.h) add_compile_definitions(GGML_USE_CLBLAST) diff --git a/Makefile b/Makefile index 9b9c8ee7a..b056a01d0 100644 --- a/Makefile +++ b/Makefile @@ -117,7 +117,7 @@ ifdef LLAMA_CLBLAST CFLAGS += -DGGML_USE_CLBLAST LDFLAGS += -lclblast -lOpenCL OBJS += ggml-opencl.o -ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h +ggml-opencl.o: ggml-opencl.c ggml-opencl.h $(CXX) $(CXXFLAGS) -c $< -o $@ endif ifdef LLAMA_GPROF diff --git a/ggml-opencl.cpp b/ggml-opencl.c similarity index 75% rename from ggml-opencl.cpp rename to ggml-opencl.c index aa426fe3f..08e8df811 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.c @@ -1,13 +1,24 @@ #include "ggml-opencl.h" -#include -#include -#include +#define CL_TARGET_OPENCL_VERSION 110 +#include + +#include +#include #include "ggml.h" #include +#define CL_CHECK(err, name) \ + do { \ + cl_int err_ = (err); \ + if (err_ != CL_SUCCESS) { \ + fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \ + exit(1); \ + } \ + } while (0) + cl_platform_id platform; cl_device_id device; cl_context context; @@ -74,7 +85,7 @@ void ggml_cl_init(void) { printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer); context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); CL_CHECK(err, "clCreateContext"); - queue = clCreateCommandQueue(context, device, 0, &err); + queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err); CL_CHECK(err, "clCreateCommandQueue"); free(platforms); @@ -93,7 +104,7 @@ void ggml_cl_init(void) { CL_CHECK(err, "clCreateKernel"); } -void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) { +static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) { if (req_size <= *cur_size) { return; } @@ -108,11 +119,14 @@ void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_me CL_CHECK(err, "clCreateBuffer"); } -void ggml_cl_sgemm_wrapper(const CLBlastLayout order, const CLBlastTranspose trans_a, const CLBlastTranspose trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype) { +void ggml_cl_sgemm_wrapper( + const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, + const int m, const int n, const int k, + const float alpha, const void *host_a, const int lda, + const float *host_b, const int ldb, const float beta, + float *host_c, const int ldc, const int btype) { cl_int err = 0; - cl_event events[4] = { NULL }; - cl_kernel kernel; size_t global = n * k, local, size_qb; bool dequant; @@ -162,42 +176,46 @@ void ggml_cl_sgemm_wrapper(const CLBlastLayout order, const CLBlastTranspose tra ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b); ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c); + cl_event ev_a, ev_qb, ev_b; + if (dequant) { err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b); CL_CHECK(err, "clSetKernelArg"); - clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, events + 1); + clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb); } else { - clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, events + 1); + clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b); } - clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, events); + clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a); if (dequant) { - err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, events + 1, events + 3); + err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b); CL_CHECK(err, "clEnqueueNDRangeKernel"); } - clWaitForEvents(dequant ? 4 : 3, events); - clReleaseEvent(events[0]); - clReleaseEvent(events[1]); - clReleaseEvent(events[2]); + clWaitForEvents(1, &ev_a); + clWaitForEvents(1, &ev_b); + clReleaseEvent(ev_a); + clReleaseEvent(ev_b); if (dequant) { - clReleaseEvent(events[3]); + clReleaseEvent(ev_qb); } - CLBlastSgemm(order, - trans_a, trans_b, + cl_event ev_sgemm; + CLBlastSgemm((CLBlastLayout)order, + (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b, m, n, k, alpha, cl_buffer_a, 0, lda, cl_buffer_b, 0, ldb, beta, cl_buffer_c, 0, ldc, - &queue, events); + &queue, &ev_sgemm); - clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, events, events + 1); + cl_event ev_c; + clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c); // Wait for completion - clWaitForEvents(2, events); - clReleaseEvent(events[0]); - clReleaseEvent(events[1]); + clWaitForEvents(1, &ev_c); + clReleaseEvent(ev_sgemm); + clReleaseEvent(ev_c); } diff --git a/ggml-opencl.h b/ggml-opencl.h index 9d7f91173..7bcc603ef 100644 --- a/ggml-opencl.h +++ b/ggml-opencl.h @@ -1,30 +1,23 @@ #pragma once -#define CL_TARGET_OPENCL_VERSION 110 -#include -#define MAX_CL_BUFFERS 16 - #ifdef __cplusplus extern "C" { #endif -// Buffer reuse code adapted from cuda implementation by slaren -#define CL_CHECK(err, name) \ - do { \ - cl_int err_ = (err); \ - if (err_ != CL_SUCCESS) { \ - fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \ - exit(1); \ - } \ - } while (0) - -cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size); -void ggml_cl_pool_free(cl_mem mem, size_t size); - -cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer); void ggml_cl_init(void); -void ggml_cl_sgemm_wrapper(const CLBlastLayout order, const CLBlastTranspose trans_a, const CLBlastTranspose trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype); +enum ggml_blas_order { + GGML_BLAS_ORDER_ROW_MAJOR = 101, + GGML_BLAS_ORDER_COLUMN_MAJOR = 102, +}; + +enum ggml_blas_op { + GGML_BLAS_OP_N = 111, + GGML_BLAS_OP_T = 112, + GGML_BLAS_OP_C = 113, +}; + +void ggml_cl_sgemm_wrapper(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype); #ifdef __cplusplus } diff --git a/ggml.c b/ggml.c index 4bc86110c..1576792b0 100644 --- a/ggml.c +++ b/ggml.c @@ -7575,7 +7575,7 @@ static void ggml_compute_forward_mul_mat_f32( CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream)); #elif defined(GGML_USE_CLBLAST) // zT = y * xT - ggml_cl_sgemm_wrapper(CLBlastLayoutRowMajor, CLBlastTransposeNo, CLBlastTransposeYes, + ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, ne11, ne01, ne10, 1.0f, y, ne10, x, ne10, @@ -7809,7 +7809,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); // zT = y * xT - ggml_cl_sgemm_wrapper(CLBlastLayoutRowMajor, CLBlastTransposeNo, CLBlastTransposeYes, + ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, ne11, ne01, ne10, 1.0f, y, ne10, x, ne10, @@ -8080,7 +8080,7 @@ static void ggml_compute_forward_mul_mat_q_f32( CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream)); #elif defined(GGML_USE_CLBLAST) // zT = y * xT - ggml_cl_sgemm_wrapper(CLBlastLayoutRowMajor, CLBlastTransposeNo, CLBlastTransposeYes, + ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, ne11, ne01, ne10, 1.0f, y, ne10, x, ne10,