From b3315459c7c87d85ab9f65c9c3d2de3b163417e6 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sun, 30 Apr 2023 14:15:44 +0800 Subject: [PATCH] pilled the new dequants for clblast, fixed some ooms --- Makefile | 2 +- ggml-opencl-dequant.cl | 84 ------------- ggml-opencl.c | 269 ++++++++++++++++++++++++++++++++++++----- otherarch/gpt2_v2.cpp | 2 +- otherarch/gptj_v2.cpp | 2 +- otherarch/neox.cpp | 2 +- 6 files changed, 242 insertions(+), 119 deletions(-) delete mode 100644 ggml-opencl-dequant.cl diff --git a/Makefile b/Makefile index b682aeea9..812c5c753 100644 --- a/Makefile +++ b/Makefile @@ -236,7 +236,7 @@ ggml_v1_noavx2.o: otherarch/ggml_v1.c otherarch/ggml_v1.h ggml_rwkv.o: otherarch/ggml_rwkv.c otherarch/ggml_rwkv.h $(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) -c $< -o $@ -llama.o: llama.cpp llama.h llama_util.h +llama.o: llama.cpp llama.h llama-util.h $(CXX) $(CXXFLAGS) -c $< -o $@ common.o: examples/common.cpp examples/common.h diff --git a/ggml-opencl-dequant.cl b/ggml-opencl-dequant.cl deleted file mode 100644 index 191b2e575..000000000 --- a/ggml-opencl-dequant.cl +++ /dev/null @@ -1,84 +0,0 @@ -#define MULTILINE_QUOTE(...) #__VA_ARGS__ -const char * clblast_dequant = MULTILINE_QUOTE( - -struct block_q4_0 -{ - float d; - uchar qs[16]; -}; - -__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) { - const uint i = get_global_id(0) / 32; - const uint l = get_local_id(0); - - const float d = blocks[i].d; - - const uchar vi = blocks[i].qs[l]; - - const uint index = i*32 + l*2; - result[index + 0] = ((vi & 0xf) - 8)*d; - result[index + 1] = ((vi >> 4) - 8)*d; -} - -struct block_q4_1 -{ - float d; - float m; - uchar qs[16]; -}; - -__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) { - const uint i = get_global_id(0) / 32; - const uint l = get_local_id(0); - - const float d = blocks[i].d; - const float m = blocks[i].m; - - const uchar vi = blocks[i].qs[l]; - - const uint index = i*32 + l*2; - result[index + 0] = (vi & 0xf) * d + m; - result[index + 1] = (vi >> 4) * d + m; -} - -struct block_q4_2 -{ - ushort d; - uchar qs[8]; -}; - -__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) { - const uint i = get_global_id(0) / 16; - const uint l = get_local_id(0); - - const float d = vload_half(0, (__global half*) &blocks[i].d);; - - const uchar vi = blocks[i].qs[l]; - - const uint index = i*16 + l*2; - result[index + 0] = ((vi & 0xf) - 8)*d; - result[index + 1] = ((vi >> 4) - 8)*d; -} - -struct block_q4_3 -{ - ushort d; - ushort m; - uchar qs[8]; -}; - -__kernel void dequantize_row_q4_3(__global struct block_q4_3* blocks, __global float* result) { - const uint i = get_global_id(0) / 16; - const uint l = get_local_id(0); - - const float d = vload_half(0, (__global half*) &(blocks[i].d)); - const float m = vload_half(0, (__global half*) &(blocks[i].m)); - - const uchar vi = blocks[i].qs[l]; - - const uint index = i*16 + l*2; - result[index + 0] = (vi & 0xf) * d + m; - result[index + 1] = (vi >> 4) * d + m; -} - -); diff --git a/ggml-opencl.c b/ggml-opencl.c index dc8e03e64..90ebe4867 100644 --- a/ggml-opencl.c +++ b/ggml-opencl.c @@ -8,7 +8,155 @@ #include "ggml.h" -#include "ggml-opencl-dequant.cl" +#define MULTILINE_QUOTE(...) #__VA_ARGS__ +const char * clblast_dequant = MULTILINE_QUOTE( + +struct block_q4_0 +{ + float d; + uchar qs[16]; +}; + +__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) { + const uint i = get_global_id(0) / 32; + const uint l = get_local_id(0); + + const float d = blocks[i].d; + + const uchar vi = blocks[i].qs[l]; + + const uint index = i*32 + l*2; + result[index + 0] = ((vi & 0xf) - 8)*d; + result[index + 1] = ((vi >> 4) - 8)*d; +} + +struct block_q4_1 +{ + float d; + float m; + uchar qs[16]; +}; + +__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) { + const uint i = get_global_id(0) / 32; + const uint l = get_local_id(0); + + const float d = blocks[i].d; + const float m = blocks[i].m; + + const uchar vi = blocks[i].qs[l]; + + const uint index = i*32 + l*2; + result[index + 0] = (vi & 0xf) * d + m; + result[index + 1] = (vi >> 4) * d + m; +} + +struct block_q4_2 +{ + ushort d; + uchar qs[8]; +}; + +__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) { + const uint i = get_global_id(0) / 16; + const uint l = get_local_id(0); + + const float d = vload_half(0, (__global half*) &blocks[i].d); + + const uchar vi = blocks[i].qs[l]; + + const uint index = i*16 + l*2; + result[index + 0] = ((vi & 0xf) - 8)*d; + result[index + 1] = ((vi >> 4) - 8)*d; +} + +struct block_q4_3 +{ + ushort d; + ushort m; + uchar qs[8]; +}; + +__kernel void dequantize_row_q4_3(__global struct block_q4_3* blocks, __global float* result) { + const uint i = get_global_id(0) / 16; + const uint l = get_local_id(0); + + const float d = vload_half(0, (__global half*) &(blocks[i].d)); + const float m = vload_half(0, (__global half*) &(blocks[i].m)); + + const uchar vi = blocks[i].qs[l]; + + const uint index = i*16 + l*2; + result[index + 0] = (vi & 0xf) * d + m; + result[index + 1] = (vi >> 4) * d + m; +} + +struct block_q5_0 +{ + float d; + uint qh; + uchar qs[16]; +}; + +__kernel void dequantize_row_q5_0(__global struct block_q5_0* blocks, __global float* result) { + const uint i = get_global_id(0) / 32; + const uint l = get_local_id(0); + + const float d = blocks[i].d; + + const uchar vi = blocks[i].qs[l]; + + const uint l2 = l * 2; + + const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4; + const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4; + + const uint index = i*32 + l2; + result[index + 0] = (((vi & 0xf) | vh0) - 16)*d; + result[index + 1] = (((vi >> 4) | vh1) - 16)*d; +} + +struct block_q5_1 +{ + ushort d; + ushort m; + uint qh; + uchar qs[16]; +}; + +__kernel void dequantize_row_q5_1(__global struct block_q5_1* blocks, __global float* result) { + const uint i = get_global_id(0) / 32; + const uint l = get_local_id(0); + + const float d = vload_half(0, (__global half*) &blocks[i].d); + const float m = vload_half(0, (__global half*) &blocks[i].m); + + const uchar vi = blocks[i].qs[l]; + + const uint l2 = l * 2; + + const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4; + const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4; + + const uint index = i*32 + l2; + result[index + 0] = ((vi & 0xf) | vh0)*d + m; + result[index + 1] = ((vi >> 4) | vh1)*d + m; +} + +struct block_q8_0 +{ + float d; + char qs[32]; +}; + +__kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) { + const uint i = get_global_id(0) / 32; + const uint l = get_local_id(0); + + result[i*32 + l] = blocks[i].qs[l] * blocks[i].d; +} + +); #define CL_CHECK(err, name) \ do { \ @@ -19,12 +167,26 @@ } \ } while (0) +#define QK5_0 32 +typedef struct { + ggml_fp16_t d; // delta + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_0 / 2]; // nibbles / quants +} block_q5_0; + + +typedef struct { + float d; // delta + uint32_t qh; // 5-th bit of quants + uint8_t qs[QK5_0 / 2]; // nibbles / quants +} cl_block_q5_0; + static cl_platform_id platform; static cl_device_id device; static cl_context context; static cl_command_queue queue; static cl_program program; -static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q4_3; +static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q4_3, kernel_q5_0, kernel_q5_1, kernel_q8_0; static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c; static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0; @@ -57,21 +219,6 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co return p; } -static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) { - if (req_size <= *cur_size) { - return; - } - - // Reallocate buffer with enough space - if (*cur_size > 0) { - clReleaseMemObject(*buf); - } - cl_int err; - *buf = clCreateBuffer(context, flags, req_size, NULL, &err); - *cur_size = req_size; - CL_CHECK(err, "clCreateBuffer"); -} - void ggml_cl_init(void) { cl_int err = 0; char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM"); @@ -114,7 +261,27 @@ void ggml_cl_init(void) { CL_CHECK(err, "clCreateKernel"); kernel_q4_3 = clCreateKernel(program, "dequantize_row_q4_3", &err); CL_CHECK(err, "clCreateKernel"); + kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err); + CL_CHECK(err, "clCreateKernel"); + kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err); + CL_CHECK(err, "clCreateKernel"); + kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err); + CL_CHECK(err, "clCreateKernel"); +} +static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) { + if (req_size <= *cur_size) { + return; + } + + // Reallocate buffer with enough space + if (*cur_size > 0) { + clReleaseMemObject(*buf); + } + cl_int err; + *buf = clCreateBuffer(context, flags, req_size, NULL, &err); + *cur_size = req_size; + CL_CHECK(err, "clCreateBuffer"); } void ggml_cl_sgemm_wrapper( @@ -128,6 +295,7 @@ void ggml_cl_sgemm_wrapper( cl_kernel kernel; size_t global = n * k, local, size_qb; bool dequant; + cl_block_q5_0* cl_host_b; switch (btype) { case GGML_TYPE_F32: @@ -149,7 +317,7 @@ void ggml_cl_sgemm_wrapper( dequant = true; kernel = kernel_q4_2; local = 8; - size_qb = global * (sizeof(short) + local) / 16; + size_qb = global * (sizeof(ggml_fp16_t) + local) / 16; break; case GGML_TYPE_Q4_3: dequant = true; @@ -157,6 +325,34 @@ void ggml_cl_sgemm_wrapper( local = 8; size_qb = global * (sizeof(short) * 2 + local) / 16; break; + case GGML_TYPE_Q5_0: + dequant = true; + kernel = kernel_q5_0; + local = 16; + // For some reason OpenCL seems to be incapable of working with structs of size 22. + // 20 and 24 bytes are fine. Workaround to do the fp16 to fp32 step on CPU... + // TODO Find the reason, fix and remove workaround. + const block_q5_0* b = (const block_q5_0*) host_b; + cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32); + for (size_t i = 0; i < global / 32; i++) { + cl_host_b[i].d = ggml_fp16_to_fp32(b[i].d); + memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t) + QK5_0 / 2); + } + host_b = (const float*) cl_host_b; + size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32; + break; + case GGML_TYPE_Q5_1: + dequant = true; + kernel = kernel_q5_1; + local = 16; + size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32; + break; + case GGML_TYPE_Q8_0: + dequant = true; + kernel = kernel_q8_0; + local = 32; + size_qb = global * (sizeof(float) + local) / 32; + break; default: fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype); abort(); @@ -180,12 +376,15 @@ void ggml_cl_sgemm_wrapper( err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b); CL_CHECK(err, "clSetKernelArg"); - clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb); + err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb); + CL_CHECK(err, "clEnqueueWriteBuffer qb"); } else { - clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b); + err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b); + CL_CHECK(err, "clEnqueueWriteBuffer b"); } - clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a); + err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a); + CL_CHECK(err, "clEnqueueWriteBuffer a"); if (dequant) { err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b); CL_CHECK(err, "clEnqueueNDRangeKernel"); @@ -197,15 +396,20 @@ void ggml_cl_sgemm_wrapper( clReleaseEvent(ev_b); cl_event ev_sgemm; - CLBlastSgemm((CLBlastLayout)order, - (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b, - m, n, k, - alpha, - cl_buffer_a, 0, lda, - cl_buffer_b, 0, ldb, - beta, - cl_buffer_c, 0, ldc, - &queue, &ev_sgemm); + CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order, + (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b, + m, n, k, + alpha, + cl_buffer_a, 0, lda, + cl_buffer_b, 0, ldb, + beta, + cl_buffer_c, 0, ldc, + &queue, &ev_sgemm); + + if (status != CLBlastSuccess) { + fprintf(stderr, "Error: CLBlast SGEMM %d\n", status); + abort(); + } cl_event ev_c; clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c); @@ -214,4 +418,7 @@ void ggml_cl_sgemm_wrapper( clWaitForEvents(1, &ev_c); clReleaseEvent(ev_sgemm); clReleaseEvent(ev_c); -} + if (btype == GGML_TYPE_Q5_0) { + free((void*) cl_host_b); + } +} \ No newline at end of file diff --git a/otherarch/gpt2_v2.cpp b/otherarch/gpt2_v2.cpp index 9b8c0baa5..a62b6e1d5 100644 --- a/otherarch/gpt2_v2.cpp +++ b/otherarch/gpt2_v2.cpp @@ -350,7 +350,7 @@ bool gpt2_eval( static size_t buf_size = 256u*1024*1024; static void * buf = malloc(buf_size); - if (mem_per_token > 0 && mem_per_token*N*1.9 > buf_size) { + if (mem_per_token > 0 && (mem_per_token*N*2 + 16u*1024*1024) > buf_size) { const size_t buf_size_new = 320u*1024*1024 + 2*(mem_per_token*N); // add 10% to account for ggml object overhead //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); diff --git a/otherarch/gptj_v2.cpp b/otherarch/gptj_v2.cpp index 11c53d141..f697a77a1 100644 --- a/otherarch/gptj_v2.cpp +++ b/otherarch/gptj_v2.cpp @@ -362,7 +362,7 @@ bool gptj_eval( static size_t buf_size = 256u*1024*1024; static void * buf = malloc(buf_size); - if (mem_per_token > 0 && mem_per_token*N*1.9 > buf_size) { + if (mem_per_token > 0 && (mem_per_token*N*2 + 16u*1024*1024) > buf_size) { const size_t buf_size_new = 320u*1024*1024 + 2*(mem_per_token*N); // add 10% to account for ggml object overhead //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); diff --git a/otherarch/neox.cpp b/otherarch/neox.cpp index 32e5b1463..47397c309 100644 --- a/otherarch/neox.cpp +++ b/otherarch/neox.cpp @@ -360,7 +360,7 @@ bool stablelm_eval( static size_t buf_size = 256u*1024*1024; static void * buf = malloc(buf_size); - if (mem_per_token > 0 && mem_per_token*N*1.9 > buf_size) { + if (mem_per_token > 0 && (mem_per_token*N*2 + 16u*1024*1024) > buf_size) { const size_t buf_size_new = 360u*1024*1024 + 2*(mem_per_token*N); // add 10% to account for ggml object overhead //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);