Allow use of OpenCL GPU-based BLAS using ClBlast instead of OpenBLAS for context processing
This commit is contained in:
parent
8a0f8673ba
commit
a908c37ce9
2 changed files with 128 additions and 8 deletions
4
Makefile
4
Makefile
|
@ -113,6 +113,10 @@ ifdef LLAMA_CUBLAS
|
||||||
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
||||||
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
|
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
|
||||||
endif
|
endif
|
||||||
|
ifdef LLAMA_CLBLAST
|
||||||
|
CFLAGS += -DGGML_USE_CLBLAST
|
||||||
|
LDFLAGS += -lclblast -lOpenCL
|
||||||
|
endif
|
||||||
ifdef LLAMA_GPROF
|
ifdef LLAMA_GPROF
|
||||||
CFLAGS += -pg
|
CFLAGS += -pg
|
||||||
CXXFLAGS += -pg
|
CXXFLAGS += -pg
|
||||||
|
|
132
ggml.c
132
ggml.c
|
@ -143,6 +143,22 @@ inline static void* ggml_aligned_malloc(size_t size) {
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
#if GGML_USE_CLBLAST
|
||||||
|
#ifndef GGML_USE_OPENBLAS
|
||||||
|
#define GGML_USE_OPENBLAS
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define CL_TARGET_OPENCL_VERSION 110
|
||||||
|
#include <clblast_c.h>
|
||||||
|
|
||||||
|
cl_platform_id platform;
|
||||||
|
cl_device_id device;
|
||||||
|
cl_context context;
|
||||||
|
cl_command_queue queue;
|
||||||
|
cl_event event;
|
||||||
|
bool cl_initialized = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(GGML_USE_ACCELERATE)
|
#if defined(GGML_USE_ACCELERATE)
|
||||||
#include <Accelerate/Accelerate.h>
|
#include <Accelerate/Accelerate.h>
|
||||||
#elif defined(GGML_USE_OPENBLAS)
|
#elif defined(GGML_USE_OPENBLAS)
|
||||||
|
@ -7422,7 +7438,7 @@ static void ggml_compute_forward_rms_norm(
|
||||||
|
|
||||||
// ggml_compute_forward_mul_mat
|
// ggml_compute_forward_mul_mat
|
||||||
|
|
||||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||||
// helper function to determine if it is better to use BLAS or not
|
// helper function to determine if it is better to use BLAS or not
|
||||||
// for large matrices, BLAS is faster
|
// for large matrices, BLAS is faster
|
||||||
static bool ggml_compute_forward_mul_mat_use_blas(
|
static bool ggml_compute_forward_mul_mat_use_blas(
|
||||||
|
@ -7447,6 +7463,85 @@ static bool ggml_compute_forward_mul_mat_use_blas(
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CLBLAST
|
||||||
|
static bool ggml_cl_sgemm_wrapper(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans_a, const enum CBLAS_TRANSPOSE trans_b, const int m, const int n, const int k, const float alpha, const float *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc) {
|
||||||
|
cl_int err = 0;
|
||||||
|
|
||||||
|
if (!cl_initialized) {
|
||||||
|
cl_uint num_platforms;
|
||||||
|
clGetPlatformIDs(0, NULL, &num_platforms);
|
||||||
|
cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
|
||||||
|
clGetPlatformIDs(num_platforms, platforms, NULL);
|
||||||
|
platform = platforms[0];
|
||||||
|
cl_uint num_devices;
|
||||||
|
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
|
||||||
|
cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
|
||||||
|
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
|
||||||
|
device = devices[0];
|
||||||
|
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
|
||||||
|
if (err != CL_SUCCESS) {
|
||||||
|
printf("Error creating OpenCL context: %d\n", err);
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
|
queue = clCreateCommandQueue(context, device, 0, &err);
|
||||||
|
event = NULL;
|
||||||
|
|
||||||
|
if (err != CL_SUCCESS) {
|
||||||
|
printf("Error creating OpenCL Command Queue: %d\n", err);
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
|
|
||||||
|
free(platforms);
|
||||||
|
free(devices);
|
||||||
|
cl_initialized = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prepare buffers
|
||||||
|
cl_mem cl_buffer_a = clCreateBuffer(context, CL_MEM_READ_WRITE, m*k*sizeof(float), NULL, &err);
|
||||||
|
if (err != CL_SUCCESS) {
|
||||||
|
printf("Error creating OpenCL Buffer A: %d\n", err);
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
|
cl_mem cl_buffer_b = clCreateBuffer(context, CL_MEM_READ_WRITE, n*k*sizeof(float), NULL, &err);
|
||||||
|
if (err != CL_SUCCESS) {
|
||||||
|
printf("Error creating OpenCL Buffer B: %d\n", err);
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
|
cl_mem cl_buffer_c = clCreateBuffer(context, CL_MEM_READ_WRITE, m*n*sizeof(float), NULL, &err);
|
||||||
|
if (err != CL_SUCCESS) {
|
||||||
|
printf("Error creating OpenCL Buffer C: %d\n", err);
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
|
|
||||||
|
clEnqueueWriteBuffer(queue, cl_buffer_a, CL_TRUE, 0, m*k*sizeof(float), host_a, 0, NULL, NULL);
|
||||||
|
clEnqueueWriteBuffer(queue, cl_buffer_b, CL_TRUE, 0, n*k*sizeof(float), host_b, 0, NULL, NULL);
|
||||||
|
clEnqueueWriteBuffer(queue, cl_buffer_c, CL_TRUE, 0, m*n*sizeof(float), host_c, 0, NULL, NULL);
|
||||||
|
|
||||||
|
// Call the SGEMM routine.
|
||||||
|
CLBlastStatusCode status = CLBlastSgemm(order,
|
||||||
|
trans_a, trans_b,
|
||||||
|
m, n, k,
|
||||||
|
alpha,
|
||||||
|
cl_buffer_a, 0, lda,
|
||||||
|
cl_buffer_b, 0, ldb,
|
||||||
|
beta,
|
||||||
|
cl_buffer_c, 0, ldc,
|
||||||
|
&queue, &event);
|
||||||
|
|
||||||
|
// Wait for completion
|
||||||
|
if (status == CLBlastSuccess) {
|
||||||
|
clWaitForEvents(1, &event);
|
||||||
|
clReleaseEvent(event);
|
||||||
|
}
|
||||||
|
|
||||||
|
clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, m*n*sizeof(float), host_c, 0, NULL, NULL);
|
||||||
|
|
||||||
|
clReleaseMemObject(cl_buffer_a);
|
||||||
|
clReleaseMemObject(cl_buffer_b);
|
||||||
|
clReleaseMemObject(cl_buffer_c);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static void ggml_compute_forward_mul_mat_f32(
|
static void ggml_compute_forward_mul_mat_f32(
|
||||||
|
@ -7462,7 +7557,7 @@ static void ggml_compute_forward_mul_mat_f32(
|
||||||
const int64_t ne02 = src0->ne[2];
|
const int64_t ne02 = src0->ne[2];
|
||||||
const int64_t ne03 = src0->ne[3];
|
const int64_t ne03 = src0->ne[3];
|
||||||
|
|
||||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||||
const int64_t ne10 = src1->ne[0];
|
const int64_t ne10 = src1->ne[0];
|
||||||
#endif
|
#endif
|
||||||
const int64_t ne11 = src1->ne[1];
|
const int64_t ne11 = src1->ne[1];
|
||||||
|
@ -7519,7 +7614,7 @@ static void ggml_compute_forward_mul_mat_f32(
|
||||||
// nb01 >= nb00 - src0 is not transposed
|
// nb01 >= nb00 - src0 is not transposed
|
||||||
// compute by src0 rows
|
// compute by src0 rows
|
||||||
|
|
||||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||||
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
||||||
if (params->ith != 0) {
|
if (params->ith != 0) {
|
||||||
return;
|
return;
|
||||||
|
@ -7570,6 +7665,13 @@ static void ggml_compute_forward_mul_mat_f32(
|
||||||
CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
|
CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
|
||||||
#else
|
#else
|
||||||
// zT = y * xT
|
// zT = y * xT
|
||||||
|
#ifdef GGML_USE_CLBLAST
|
||||||
|
ggml_cl_sgemm_wrapper(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||||
|
ne11, ne01, ne10,
|
||||||
|
1.0f, y, ne10,
|
||||||
|
x, ne10,
|
||||||
|
0.0f, d, ne01);
|
||||||
|
#else
|
||||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||||
ne11, ne01, ne10,
|
ne11, ne01, ne10,
|
||||||
1.0f, y, ne10,
|
1.0f, y, ne10,
|
||||||
|
@ -7713,7 +7815,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
||||||
// nb01 >= nb00 - src0 is not transposed
|
// nb01 >= nb00 - src0 is not transposed
|
||||||
// compute by src0 rows
|
// compute by src0 rows
|
||||||
|
|
||||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||||
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
||||||
GGML_ASSERT(nb10 == sizeof(float));
|
GGML_ASSERT(nb10 == sizeof(float));
|
||||||
|
|
||||||
|
@ -7797,6 +7899,13 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
||||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||||
|
|
||||||
// zT = y * xT
|
// zT = y * xT
|
||||||
|
#ifdef GGML_USE_CLBLAST
|
||||||
|
ggml_cl_sgemm_wrapper(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||||
|
ne11, ne01, ne10,
|
||||||
|
1.0f, y, ne10,
|
||||||
|
x, ne10,
|
||||||
|
0.0f, d, ne01);
|
||||||
|
#else
|
||||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||||
ne11, ne01, ne10,
|
ne11, ne01, ne10,
|
||||||
1.0f, y, ne10,
|
1.0f, y, ne10,
|
||||||
|
@ -7963,7 +8072,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
||||||
// nb01 >= nb00 - src0 is not transposed
|
// nb01 >= nb00 - src0 is not transposed
|
||||||
// compute by src0 rows
|
// compute by src0 rows
|
||||||
|
|
||||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||||
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
||||||
if (params->ith != 0) {
|
if (params->ith != 0) {
|
||||||
return;
|
return;
|
||||||
|
@ -8053,6 +8162,13 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
||||||
CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
|
CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
|
||||||
#else
|
#else
|
||||||
// zT = y * xT
|
// zT = y * xT
|
||||||
|
#ifdef GGML_USE_CLBLAST
|
||||||
|
ggml_cl_sgemm_wrapper(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||||
|
ne11, ne01, ne10,
|
||||||
|
1.0f, y, ne10,
|
||||||
|
x, ne10,
|
||||||
|
0.0f, d, ne01);
|
||||||
|
#else
|
||||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||||
ne11, ne01, ne10,
|
ne11, ne01, ne10,
|
||||||
1.0f, y, ne10,
|
1.0f, y, ne10,
|
||||||
|
@ -10885,7 +11001,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
||||||
size_t cur = 0;
|
size_t cur = 0;
|
||||||
|
|
||||||
if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
|
if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
|
||||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||||
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
||||||
node->n_tasks = 1; // TODO: this actually is doing nothing
|
node->n_tasks = 1; // TODO: this actually is doing nothing
|
||||||
// the threads are still spinning
|
// the threads are still spinning
|
||||||
|
@ -10902,7 +11018,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
||||||
} else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
|
} else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
|
||||||
cur = 0;
|
cur = 0;
|
||||||
} else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
|
} else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
|
||||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||||
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
||||||
node->n_tasks = 1;
|
node->n_tasks = 1;
|
||||||
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
|
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
|
||||||
|
@ -12303,7 +12419,7 @@ int ggml_cpu_has_wasm_simd(void) {
|
||||||
}
|
}
|
||||||
|
|
||||||
int ggml_cpu_has_blas(void) {
|
int ggml_cpu_has_blas(void) {
|
||||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||||
return 1;
|
return 1;
|
||||||
#else
|
#else
|
||||||
return 0;
|
return 0;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue