From 8795403de333abebf4aa8c0642a12f05f2d652a5 Mon Sep 17 00:00:00 2001
From: 0cc4m
Date: Sun, 14 May 2023 21:14:05 +0200
Subject: [PATCH] Fix bugs in dequant_mul_mat code

---
 ggml-opencl.cpp | 31 +++++++++++++++----------------
 llama.cpp       |  2 +-
 2 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
index d99ceeef0..d391e95fc 100644
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@@ -155,8 +155,8 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float*
 }
 
 __kernel void dequantize_mul_mat_vec(__global struct block_q4_0* x, __local float* tmp, __global float* y, __global float* dst, int ncols) {
-    const int row = get_global_id(0);
-    const int tid = get_local_id(0);
+    const int row = get_local_id(0);
+    const int tid = get_global_id(0);
     const int block_size = get_local_size(0);
 
     const uint qk = QK4_0;
@@ -173,7 +173,6 @@ __kernel void dequantize_mul_mat_vec(__global struct block_q4_0* x, __local floa
     const int iybs = col - col%qk; // y block start index
 
     // dequantize
-    float v0, v1;
     const float d = x[ib].d;
     const uint8_t vui = x[ib].qs[iqs];
 
@@ -181,8 +180,8 @@ __kernel void dequantize_mul_mat_vec(__global struct block_q4_0* x, __local floa
     const int8_t vi0 = vui & 0xF;
     const int8_t vi1 = vui >> 4;
 
-    v0 = (vi0 - 8)*d;
-    v1 = (vi1 - 8)*d;
+    float v0 = (vi0 - 8)*d;
+    float v1 = (vi1 - 8)*d;
 
     // matrix multiplication
     tmp[tid] += v0 * y[iybs + iqs + 0];
@@ -651,41 +650,40 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
-            cl_event ev_Q, ev_sgemm;
+            cl_event ev_sgemm;
 
             // copy src0 to device if necessary
             if (src0->backend == GGML_BACKEND_CPU) {
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, &ev_Q), "ggml_cl_h2d_tensor_2d");
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, NULL), "ggml_cl_h2d_tensor_2d");
             } else if (src0->backend == GGML_BACKEND_CL) {
-                d_Q = * (cl_mem *) src0->data;
+                d_Q = *(cl_mem*) src0->data;
             } else {
                 GGML_ASSERT(false);
             }
 
             if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
-                printf("Gogogo\n");
                 // copy src1 to device
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL), "ggml_cl_h2d_tensor_2d");
 
                 // compute
                 // dequantize_mul_mat_vec(__global void * vx, __local float* tmp, __global float * y, __global float * dst, __global int ncols, __global int vx_type) {
-                const size_t global = ne00;
+                const size_t global = ne01;
                 const size_t local = CL_DMMV_BLOCK_SIZE;
-                const cl_int ncols = ne01;
-                const cl_int qtype = src0->type;
+                const cl_int ncols = ne00;
                 CL_CHECK(clSetKernelArg(dequantize_mul_mat_vec_cl, 0, sizeof(cl_mem), &d_Q), "clSetKernelArg");
                 CL_CHECK(clSetKernelArg(dequantize_mul_mat_vec_cl, 1, sizeof(float) * local, NULL), "clSetKernelArg");
                 CL_CHECK(clSetKernelArg(dequantize_mul_mat_vec_cl, 2, sizeof(cl_mem), &d_Y), "clSetKernelArg");
                 CL_CHECK(clSetKernelArg(dequantize_mul_mat_vec_cl, 3, sizeof(cl_mem), &d_D), "clSetKernelArg");
                 CL_CHECK(clSetKernelArg(dequantize_mul_mat_vec_cl, 4, sizeof(cl_int), &ncols), "clSetKernelArg");
-                CL_CHECK(clSetKernelArg(dequantize_mul_mat_vec_cl, 5, sizeof(cl_int), &qtype), "clSetKernelArg");
-                CL_CHECK(clEnqueueNDRangeKernel(queue, dequantize_mul_mat_vec_cl, 1, NULL, &global, &local, 1, &ev_Q, &ev_sgemm), "clEnqueueNDRangeKernel");
+                CL_CHECK(clFinish(queue), "clFinish");
+                CL_CHECK(clEnqueueNDRangeKernel(queue, dequantize_mul_mat_vec_cl, 1, NULL, &global, &local, 0, NULL, &ev_sgemm), "clEnqueueNDRangeKernel");
             } else { // general dequantization kernel + CLBlast matrix matrix multiplication
                 // convert src0 to fp32 on device
                 const size_t global = x_ne;
-                const size_t local = 16;
+                const size_t local = ggml_blck_size(type) / 2;
                 CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q), "clSetKernelArg");
                 CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X), "clSetKernelArg");
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, &local, 1, &ev_Q, NULL), "clEnqueueNDRangeKernel");
+                CL_CHECK(clFinish(queue), "clFinish");
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, &local, 0, NULL, NULL), "clEnqueueNDRangeKernel");
 
                 // copy src1 to device
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL), "ggml_cl_h2d_tensor_2d");
@@ -712,6 +710,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             // copy dst to host
             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL), "clEnqueueReadBuffer");
+            clReleaseEvent(ev_sgemm);
         }
     }
 
diff --git a/llama.cpp b/llama.cpp
index 0ffcb5505..1ec03468e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1075,7 +1075,7 @@ static void llama_model_load_internal(
             ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
         }
         if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+            fprintf(stderr, "%s: [opencl] offloading output layer to GPU\n", __func__);
             ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
         }
 