Merge commit 'ebc5d0651a' into concedo_experimental

# Conflicts:
#	ggml-opencl.cpp
Commit: 254a9ff12c
Author: Concedo
Date:   2023-05-29 16:26:24 +08:00

--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@@ -3,6 +3,7 @@
 #include <array>
 #include <atomic>
 #include <sstream>
+#include <vector>
 #define CL_TARGET_OPENCL_VERSION 110
 #include <clblast.h>
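The only change in the header block is the new `#include <vector>`, which the hunks below need for the `std::vector<cl_event>` that replaces the single `ev_sgemm` event.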
@@ -877,11 +878,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
-            cl_event ev_sgemm;
+            size_t ev_idx = 0;
+            std::vector<cl_event> events;

             // copy src0 to device if necessary
             if (src0->backend == GGML_BACKEND_CPU) {
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, NULL));
+                events.emplace_back();
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
             } else if (src0->backend == GGML_BACKEND_CL) {
                 d_Q = *(cl_mem*) src0->data;
             } else {
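The per-iteration `cl_event ev_sgemm` becomes a vector of events plus a running index: each enqueue first appends a default-constructed (null) `cl_event` with `events.emplace_back()`, then passes the address of that slot, `events.data() + ev_idx++`, as its output-event argument. Every copy and kernel launch in the iteration thus records a completion event.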
@@ -889,30 +892,32 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             }

             if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                 // copy src1 to device
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+                events.emplace_back();
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));

                 // compute
                 const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
                 const size_t local = CL_DMMV_BLOCK_SIZE;
                 const cl_int ncols = ne00;
+                events.emplace_back();
                 CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
                 CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
                 CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
                 CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
                 CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-                CL_CHECK(clFinish(queue));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, 0, NULL, &ev_sgemm));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
             } else { // general dequantization kernel + CLBlast matrix matrix multiplication
                 // convert src0 to fp32 on device
                 const size_t global = x_ne;
                 CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
                 CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
-                CL_CHECK(clFinish(queue));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, 0, NULL, NULL));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));

                 // copy src1 to device
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));

+                events.emplace_back();
+
                 // wait for conversion
                 CL_CHECK(clFinish(queue));
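This hunk carries the substance of the change: the blocking `clFinish(queue)` calls that previously preceded each kernel launch are gone. In the `mul_mat_vec` path, `clEnqueueNDRangeKernel` instead receives the copy events accumulated so far through its `num_events_in_wait_list`/`event_wait_list` parameters, so the driver orders the transfers and the kernel without stalling the host; the `to_fp32_cl` dequantization kernel in the general path is chained the same way. One `clFinish` does remain before the CLBlast call ("wait for conversion"), plausibly because CLBlast's `Gemm` takes an output event but no wait list of its own.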
@@ -925,7 +930,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                                                        d_Y, 0, ne10,
                                                beta,
                                                        d_D, 0, ne01,
-                                                       &queue, &ev_sgemm);
+                                                       &queue, events.data() + ev_idx++);

                 if (status != clblast::StatusCode::kSuccess) {
                     printf("\nQF32 Matmul Failed (%d): [dims: %lld,%lld,%lld,%lld] You may be out of VRAM. Please check if you have enough.\n",status,ne00,ne01,ne10,ne11);
@@ -935,8 +940,10 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             // copy dst to host
             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
-            clReleaseEvent(ev_sgemm);
+            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
+            for (auto *event : events) {
+                clReleaseEvent(event);
+            }
         }
     }
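Taken together, the pattern is: record a completion event for each asynchronous enqueue, feed the accumulated events to the next operation's wait list, and release everything at the end. Below is a minimal self-contained sketch of that pattern, not code from the commit; the function name `chained_pipeline` and its parameters are hypothetical, error checking is omitted for brevity, and only standard OpenCL 1.1 calls are used.

// Minimal sketch (not from the commit) of the event-chaining pattern the
// diff adopts. Names are hypothetical; error checking omitted for brevity.
#include <vector>
#include <CL/cl.h>

static void chained_pipeline(cl_command_queue queue, cl_kernel kernel,
                             cl_mem d_in, cl_mem d_out,
                             const float * src, float * dst, size_t n) {
    size_t ev_idx = 0;
    std::vector<cl_event> events;

    // Non-blocking upload; the driver writes the completion event into
    // events[0] before the call returns, so later vector growth is harmless.
    events.emplace_back();
    clEnqueueWriteBuffer(queue, d_in, CL_FALSE, 0, n * sizeof(float), src,
                         0, NULL, events.data() + ev_idx++);

    // The kernel names the upload in its wait list instead of the host
    // calling clFinish() in between.
    events.emplace_back();
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_in);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_out);
    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &n, NULL,
                           events.size() - 1, events.data(),
                           events.data() + ev_idx++);

    // Blocking read that waits only on the last recorded event.
    clEnqueueReadBuffer(queue, d_out, CL_TRUE, 0, n * sizeof(float), dst,
                        1, &events[events.size() - 1], NULL);

    // Events are driver objects and must be released explicitly.
    for (auto * event : events) {
        clReleaseEvent(event);
    }
}

The practical win over `clFinish()` is that the host never blocks mid-pipeline; only the final read-back synchronizes, which is exactly the behavior the merged hunks give `ggml_cl_mul_mat_q_f32`.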