diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 667f55e79..91abf5268 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #define CL_TARGET_OPENCL_VERSION 110 #include @@ -867,11 +868,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - cl_event ev_sgemm; + size_t ev_idx = 0; + std::vector events; // copy src0 to device if necessary if (src0->backend == GGML_BACKEND_CPU) { - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, NULL)); + events.emplace_back(); + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++)); } else if (src0->backend == GGML_BACKEND_CL) { d_Q = *(cl_mem*) src0->data; } else { @@ -879,30 +882,32 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * } if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel // copy src1 to device - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL)); + events.emplace_back(); + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++)); // compute const size_t global = ne01 * CL_DMMV_BLOCK_SIZE; const size_t local = CL_DMMV_BLOCK_SIZE; const cl_int ncols = ne00; + events.emplace_back(); CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q)); CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL)); CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y)); CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D)); CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols)); - CL_CHECK(clFinish(queue)); - CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, 0, NULL, &ev_sgemm)); + CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++)); } else { // general dequantization kernel + CLBlast matrix matrix multiplication // convert src0 to fp32 on device const size_t global = x_ne; CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q)); CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X)); - CL_CHECK(clFinish(queue)); - CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, 0, NULL, NULL)); + CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, events.size(), !events.empty() ? events.data() : NULL, NULL)); // copy src1 to device CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL)); + events.emplace_back(); + // wait for conversion CL_CHECK(clFinish(queue)); @@ -915,7 +920,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * d_Y, 0, ne10, beta, d_D, 0, ne01, - &queue, &ev_sgemm); + &queue, events.data() + ev_idx++); if (status != clblast::StatusCode::kSuccess) { GGML_ASSERT(false); @@ -924,8 +929,10 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * // copy dst to host float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); - clReleaseEvent(ev_sgemm); + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL)); + for (auto *event : events) { + clReleaseEvent(event); + } } }