Reduce queueing overhead for contiguous tensors by using single mul kernel call

0cc4m 2023-05-30 18:49:53 +02:00
parent 97c5cca4e5
commit ac6b49ed45

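For context: every dispatch in this diff targets the mul_f32 OpenCL kernel. A minimal sketch of that kernel, inferred from the clSetKernelArg order used below (the actual source lives in ggml-opencl.cpp and may differ in detail), not copied from the repository verbatim:

__kernel void mul_f32(__global float * x, const int x_offset,
                      __global float * y, const int y_offset,
                      __global float * dst, const int dst_offset,
                      const int ky) {
    const int i = get_global_id(0);
    // src1 (y) is broadcast along the row dimension via the modulo by ky (= ne10)
    dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i % ky];
}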

@@ -691,6 +691,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     const int64_t ne11 = src1->ne[1];
     const int64_t ne12 = src1->ne[2];
     const int64_t ne13 = src1->ne[3];
+    const int64_t nb10 = src1->nb[0];
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
     size_t x_size;
@@ -709,6 +710,27 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
             // copy src0 to device
             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, i0, src0, i03, i02, &ev));
 
+            if (nb10 == sizeof(float)) {
+                // Contiguous, avoid overhead from queueing many kernel runs
+                const int64_t i13 = i03%ne13;
+                const int64_t i12 = i02%ne12;
+                const int i1 = i13*ne12*ne11 + i12*ne11;
+
+                cl_int x_offset = 0;
+                cl_int y_offset = i1*ne10;
+                cl_int d_offset = 0;
+
+                size_t global = ne00 * ne01;
+                cl_int ky = ne10;
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
+                CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+            } else {
             for (int64_t i01 = 0; i01 < ne01; i01++) {
                 const int64_t i13 = i03%ne13;
                 const int64_t i12 = i02%ne12;
@@ -731,6 +753,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
                 CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
                 CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
             }
+            }
 
             CL_CHECK(clReleaseEvent(ev));
             CL_CHECK(clFinish(queue));
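The contiguity test nb10 == sizeof(float) checks that src1's innermost stride is one float, so a whole (i02, i03) slice can be treated as a flat array. The old path issued ne01 separate clEnqueueNDRangeKernel calls of ne00 work items each; the fast path issues a single call over ne00 * ne01 work items, amortizing the fixed per-enqueue driver overhead across the slice. A CPU mirror of what that single dispatch computes, as a hypothetical reference helper (mul_f32_reference is not part of ggml-opencl.cpp):

#include <stdint.h>

// Reference semantics of the single-kernel fast path: one flat range of
// ne00*ne01 elements, with src1 (y) broadcast through the modulo by ne10
// (mirrors x_offset = 0, d_offset = 0, ky = ne10 in the diff above).
static void mul_f32_reference(const float * x, const float * y, float * dst,
                              int64_t ne00, int64_t ne01, int64_t ne10,
                              int64_t y_offset) {
    for (int64_t i = 0; i < ne00 * ne01; i++) {
        dst[i] = x[i] * y[y_offset + i % ne10];
    }
}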