From ac6b49ed45b959e75f6ec7432fb6a5a2dc88cc4e Mon Sep 17 00:00:00 2001
From: 0cc4m
Date: Tue, 30 May 2023 18:49:53 +0200
Subject: [PATCH] Reduce queueing overhead for contiguous tensors by using
 single mul kernel call

When src1 rows are contiguous (nb10 == sizeof(float)), the whole src0
slice can be handled by one clEnqueueNDRangeKernel call instead of one
enqueue per row. The mul_f32 kernel indexes src1 as y[y_offset + i % ky],
so in the single-call path ky must be ne10 * ne11: the modulo then wraps
over all ne11 src1 rows and reproduces the per-row broadcast
(i11 = i01 % ne11) of the row-by-row path. Using plain ne10 here would
multiply every row of src0 by src1 row 0 only. Non-contiguous src1 keeps
the original per-row loop as a fallback.
---
 ggml-opencl.cpp | 39 +++++++++++++++++++++++++++++++--------
 1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
index 974dab432..197868863 100644
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@@ -691,6 +691,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     const int64_t ne11 = src1->ne[1];
     const int64_t ne12 = src1->ne[2];
     const int64_t ne13 = src1->ne[3];
+    const int64_t nb10 = src1->nb[0];
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
     size_t x_size;
@@ -709,18 +710,17 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
             // copy src0 to device
             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, i0, src0, i03, i02, &ev));
 
-            for (int64_t i01 = 0; i01 < ne01; i01++) {
+            if (nb10 == sizeof(float)) {
+                // Contiguous, avoid overhead from queueing many kernel runs
                 const int64_t i13 = i03%ne13;
                 const int64_t i12 = i02%ne12;
-                const int64_t i11 = i01%ne11;
-                const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
+                const int i1 = i13*ne12*ne11 + i12*ne11;
 
-                cl_int x_offset = i01*ne00;
+                cl_int x_offset = 0;
                 cl_int y_offset = i1*ne10;
-                cl_int d_offset = i01*ne00;
+                cl_int d_offset = 0;
 
-                // compute
-                size_t global = ne00;
-                cl_int ky = ne10;
+                size_t global = ne00 * ne01;
+                cl_int ky = ne10 * ne11;
                 CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
                 CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
@@ -730,6 +730,29 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
                 CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
                 CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
                 CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+            } else {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const int64_t i13 = i03%ne13;
+                    const int64_t i12 = i02%ne12;
+                    const int64_t i11 = i01%ne11;
+                    const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
+
+                    cl_int x_offset = i01*ne00;
+                    cl_int y_offset = i1*ne10;
+                    cl_int d_offset = i01*ne00;
+
+                    // compute
+                    size_t global = ne00;
+                    cl_int ky = ne10;
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+                    CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+                    CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+                }
             }
 
             CL_CHECK(clReleaseEvent(ev));