opencl: use cl_ulong for sizes and strides

This commit is contained in:
Max Krasnyansky 2024-12-07 18:02:15 -08:00
parent c21fc8c5f9
commit 9a9d92b0b9
2 changed files with 469 additions and 475 deletions

View file

@ -245,7 +245,6 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL); err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL);
if(err < 0) { if(err < 0) {
clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
program_log = (char*) malloc(log_size + 1); program_log = (char*) malloc(log_size + 1);
program_log[log_size] = '\0'; program_log[log_size] = '\0';
@ -1953,14 +1952,14 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
GGML_ASSERT(dst->extra); GGML_ASSERT(dst->extra);
const int ne00 = src0 ? src0->ne[0] : 0; const int ne00 = src0 ? src0->ne[0] : 0;
const int nb01 = src0 ? src0->nb[1] : 0; const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
const int nb02 = src0 ? src0->nb[2] : 0; const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
const int ne10 = src1 ? src1->ne[0] : 0; const int ne10 = src1 ? src1->ne[0] : 0;
const int nb10 = src1 ? src1->nb[0] : 0; const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
const int ne11 = src1 ? src1->ne[1] : 0; const int ne11 = src1 ? src1->ne[1] : 0;
const int nb11 = src1 ? src1->nb[1] : 0; const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
const int nb1 = dst ? dst->nb[1] : 0; const cl_ulong nb1 = dst ? dst->nb[1] : 0;
const int nb2 = dst ? dst->nb[2] : 0; const cl_ulong nb2 = dst ? dst->nb[2] : 0;
ggml_backend_opencl2_context *backend_ctx = (ggml_backend_opencl2_context *)backend->context; ggml_backend_opencl2_context *backend_ctx = (ggml_backend_opencl2_context *)backend->context;
cl_command_queue queue = backend_ctx->queue; cl_command_queue queue = backend_ctx->queue;
@ -1996,13 +1995,13 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &nb01)); CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &nb02)); CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10)); CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &nb10)); CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb10));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &nb11)); CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb11));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &nb1)); CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb1));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &nb2)); CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb2));
size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1}; size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1};
size_t local_work_size[] = {1, 1, 1}; size_t local_work_size[] = {1, 1, 1};
@ -2068,7 +2067,6 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
cl_ulong offsetd = extrad->offset + dst->view_offs; cl_ulong offsetd = extrad->offset + dst->view_offs;
bool bcast_row = false; bool bcast_row = false;
int nb = ne00;
cl_kernel kernel; cl_kernel kernel;
if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) { if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
@ -2078,7 +2076,7 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
GGML_ASSERT(ne11 == 1); GGML_ASSERT(ne11 == 1);
bcast_row = true; bcast_row = true;
nb = ne00 / 4; int ne = ne00 / 4;
kernel = backend_ctx->kernel_add_row; kernel = backend_ctx->kernel_add_row;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
@ -2087,7 +2085,7 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &nb)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
} else { } else {
kernel = backend_ctx->kernel_add; kernel = backend_ctx->kernel_add;
@ -2167,30 +2165,30 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
const int ne02 = src0 ? src0->ne[2] : 0; const int ne02 = src0 ? src0->ne[2] : 0;
const int ne03 = src0 ? src0->ne[3] : 0; const int ne03 = src0 ? src0->ne[3] : 0;
const int nb00 = src0 ? src0->nb[0] : 0; const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
const int nb01 = src0 ? src0->nb[1] : 0; const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
const int nb02 = src0 ? src0->nb[2] : 0; const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
const int nb03 = src0 ? src0->nb[3] : 0; const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
const int ne10 = src1 ? src1->ne[0] : 0; const int ne10 = src1 ? src1->ne[0] : 0;
const int ne11 = src1 ? src1->ne[1] : 0; const int ne11 = src1 ? src1->ne[1] : 0;
const int ne12 = src1 ? src1->ne[2] : 0; const int ne12 = src1 ? src1->ne[2] : 0;
const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13); const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
const int nb10 = src1 ? src1->nb[0] : 0; const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
const int nb11 = src1 ? src1->nb[1] : 0; const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
const int nb12 = src1 ? src1->nb[2] : 0; const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
const int nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13); const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
const int ne0 = dst ? dst->ne[0] : 0; const int ne0 = dst ? dst->ne[0] : 0;
const int ne1 = dst ? dst->ne[1] : 0; const int ne1 = dst ? dst->ne[1] : 0;
const int ne2 = dst ? dst->ne[2] : 0; const int ne2 = dst ? dst->ne[2] : 0;
const int ne3 = dst ? dst->ne[3] : 0; const int ne3 = dst ? dst->ne[3] : 0;
const int nb0 = dst ? dst->nb[0] : 0; const cl_ulong nb0 = dst ? dst->nb[0] : 0;
const int nb1 = dst ? dst->nb[1] : 0; const cl_ulong nb1 = dst ? dst->nb[1] : 0;
const int nb2 = dst ? dst->nb[2] : 0; const cl_ulong nb2 = dst ? dst->nb[2] : 0;
const int nb3 = dst ? dst->nb[3] : 0; const cl_ulong nb3 = dst ? dst->nb[3] : 0;
ggml_backend_opencl2_context *backend_ctx = (ggml_backend_opencl2_context *)backend->context; ggml_backend_opencl2_context *backend_ctx = (ggml_backend_opencl2_context *)backend->context;
cl_command_queue queue = backend_ctx->queue; cl_command_queue queue = backend_ctx->queue;
@ -2204,7 +2202,6 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
cl_ulong offsetd = extrad->offset + dst->view_offs; cl_ulong offsetd = extrad->offset + dst->view_offs;
bool bcast_row = false; bool bcast_row = false;
int nb = ne00;
cl_kernel kernel; cl_kernel kernel;
if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) { if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
@ -2214,7 +2211,7 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
GGML_ASSERT(ne11 == 1); GGML_ASSERT(ne11 == 1);
bcast_row = true; bcast_row = true;
nb = ne00 / 4; int ne = ne00 / 4;
kernel = backend_ctx->kernel_mul_row; kernel = backend_ctx->kernel_mul_row;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
@ -2223,7 +2220,7 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &nb)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
} else { } else {
kernel = backend_ctx->kernel_mul; kernel = backend_ctx->kernel_mul;
@ -2237,26 +2234,26 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03)); CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &nb00)); CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &nb01)); CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &nb02)); CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &nb03)); CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10)); CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11)); CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12)); CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13)); CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &nb10)); CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &nb11)); CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &nb12)); CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &nb13)); CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0)); CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0));
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1)); CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1));
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2)); CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2));
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3)); CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3));
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &nb0)); CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0));
CL_CHECK(clSetKernelArg(kernel, 27, sizeof(int), &nb1)); CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1));
CL_CHECK(clSetKernelArg(kernel, 28, sizeof(int), &nb2)); CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2));
CL_CHECK(clSetKernelArg(kernel, 29, sizeof(int), &nb3)); CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3));
} }
if (bcast_row) { if (bcast_row) {
@ -2492,7 +2489,7 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
memcpy(&eps, dst->op_params, sizeof(float)); memcpy(&eps, dst->op_params, sizeof(float));
const int ne00 = src0 ? src0->ne[0] : 0; const int ne00 = src0 ? src0->ne[0] : 0;
const int nb01 = src0 ? src0->nb[1] : 0; const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
GGML_ASSERT(ggml_is_contiguous_1(src0)); GGML_ASSERT(ggml_is_contiguous_1(src0));
@ -2505,7 +2502,7 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &nb01)); CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(float)*nth, NULL)); CL_CHECK(clSetKernelArg(kernel, 7, sizeof(float)*nth, NULL));
@ -2549,7 +2546,7 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
memcpy(&eps, dst->op_params, sizeof(float)); memcpy(&eps, dst->op_params, sizeof(float));
const int ne00 = src0 ? src0->ne[0] : 0; const int ne00 = src0 ? src0->ne[0] : 0;
const int nb01 = src0 ? src0->nb[1] : 0; const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
GGML_ASSERT(ne00 % 4 == 0); GGML_ASSERT(ne00 % 4 == 0);
GGML_ASSERT(ggml_is_contiguous_1(src0)); GGML_ASSERT(ggml_is_contiguous_1(src0));
@ -2578,7 +2575,7 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &nb01)); CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
// This is local memory - the size depends on subgroup size. // This is local memory - the size depends on subgroup size.
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(float)*nth/sgs, NULL)); CL_CHECK(clSetKernelArg(kernel, 7, sizeof(float)*nth/sgs, NULL));
@ -2625,20 +2622,20 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
const int ne02 = src0 ? src0->ne[2] : 0; const int ne02 = src0 ? src0->ne[2] : 0;
const int ne03 = src0 ? src0->ne[3] : 0; const int ne03 = src0 ? src0->ne[3] : 0;
const int nb00 = src0 ? src0->nb[0] : 0; const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
const int nb01 = src0 ? src0->nb[1] : 0; const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
const int nb02 = src0 ? src0->nb[2] : 0; const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
const int nb03 = src0 ? src0->nb[3] : 0; const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
const int ne10 = src1 ? src1->ne[0] : 0; const int ne10 = src1 ? src1->ne[0] : 0;
const int ne11 = src1 ? src1->ne[1] : 0; const int ne11 = src1 ? src1->ne[1] : 0;
const int ne12 = src1 ? src1->ne[2] : 0; const int ne12 = src1 ? src1->ne[2] : 0;
const int ne13 = src1 ? src1->ne[3] : 0; const int ne13 = src1 ? src1->ne[3] : 0;
const int nb10 = src1 ? src1->nb[0] : 0; const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
const int nb11 = src1 ? src1->nb[1] : 0; const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
const int nb12 = src1 ? src1->nb[2] : 0; const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
const int nb13 = src1 ? src1->nb[3] : 0; const cl_ulong nb13 = src1 ? src1->nb[3] : 0;
const int ne0 = dst ? dst->ne[0] : 0; const int ne0 = dst ? dst->ne[0] : 0;
const int ne1 = dst ? dst->ne[1] : 0; const int ne1 = dst ? dst->ne[1] : 0;
@ -3068,22 +3065,21 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &nb00)); CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb00));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &nb01)); CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &nb02)); CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &nb03)); CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10)); CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11)); CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12)); CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &nb10)); CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &nb11)); CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &nb12)); CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &nb13)); CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne0)); CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne0));
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne1)); CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne1));
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2)); CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2));
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r3)); CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r3));
break; break;
case GGML_TYPE_F16: case GGML_TYPE_F16:
//GGML_ASSERT(ne02 == ne12); //GGML_ASSERT(ne02 == ne12);
@ -3121,17 +3117,17 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &nb00)); CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb00));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &nb01)); CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &nb02)); CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &nb03)); CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10)); CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11)); CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12)); CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &nb10)); CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &nb11)); CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &nb12)); CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &nb13)); CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne0)); CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne0));
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne1)); CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne1));
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2)); CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2));
@ -3335,10 +3331,8 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &scale)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &scale));
int n = ggml_nelements(dst)/4; int n = ggml_nelements(dst)/4;
@ -3372,20 +3366,20 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
const int ne02 = src0 ? src0->ne[2] : 0; const int ne02 = src0 ? src0->ne[2] : 0;
const int ne03 = src0 ? src0->ne[3] : 0; const int ne03 = src0 ? src0->ne[3] : 0;
const int nb00 = src0 ? src0->nb[0] : 0; const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
const int nb01 = src0 ? src0->nb[1] : 0; const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
const int nb02 = src0 ? src0->nb[2] : 0; const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
const int nb03 = src0 ? src0->nb[3] : 0; const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
const int ne10 = src1 ? src1->ne[0] : 0; const int ne10 = src1 ? src1->ne[0] : 0;
const int ne11 = src1 ? src1->ne[1] : 0; const int ne11 = src1 ? src1->ne[1] : 0;
const int ne12 = src1 ? src1->ne[2] : 0; const int ne12 = src1 ? src1->ne[2] : 0;
const int ne13 = src1 ? src1->ne[3] : 0; const int ne13 = src1 ? src1->ne[3] : 0;
const int nb10 = src1 ? src1->nb[0] : 0; const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
const int nb11 = src1 ? src1->nb[1] : 0; const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
const int nb12 = src1 ? src1->nb[2] : 0; const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
const int nb13 = src1 ? src1->nb[3] : 0; const cl_ulong nb13 = src1 ? src1->nb[3] : 0;
const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
@ -3438,18 +3432,18 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01)); CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03)); CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &nb00)); CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &nb01)); CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &nb02)); CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &nb03)); CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11)); CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12)); CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13)); CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &nb10)); CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &nb11)); CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &nb12)); CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &nb13)); CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
const int nth = MIN(64, ne00); const int nth = MIN(64, ne00);
@ -3754,18 +3748,18 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01)); CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02)); CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne03)); CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne03));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &nb00)); CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb00));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &nb01)); CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &nb02)); CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &nb03)); CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne0)); CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne0));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne1)); CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne1));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne2)); CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne2));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne3)); CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne3));
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &nb0)); CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb0));
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &nb1)); CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb1));
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &nb2)); CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb2));
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &nb3)); CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &nb3));
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &n_past)); CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &n_past));
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &n_dims)); CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &n_dims));
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &n_ctx_orig)); CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &n_ctx_orig));

View file

@ -298,7 +298,7 @@ kernel void kernel_add_row(
ulong offset1, ulong offset1,
global float4 * dst, global float4 * dst,
ulong offsetd, ulong offsetd,
int nb int ne
) { ) {
src0 = (global float4*)((global char*)src0 + offset0); src0 = (global float4*)((global char*)src0 + offset0);
src1 = (global float4*)((global char*)src1 + offset1); src1 = (global float4*)((global char*)src1 + offset1);
@ -306,7 +306,7 @@ kernel void kernel_add_row(
// This performs better than using %. // This performs better than using %.
uint gid = get_global_id(0); uint gid = get_global_id(0);
uint idx1 = gid - (gid/nb)*nb; // get_global_id(0) % nb uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
dst[gid] = src0[gid] + src1[idx1]; dst[gid] = src0[gid] + src1[idx1];
} }
@ -324,26 +324,26 @@ kernel void kernel_mul(
int ne01, int ne01,
int ne02, int ne02,
int ne03, int ne03,
int nb00, ulong nb00,
int nb01, ulong nb01,
int nb02, ulong nb02,
int nb03, ulong nb03,
int ne10, int ne10,
int ne11, int ne11,
int ne12, int ne12,
int ne13, int ne13,
int nb10, ulong nb10,
int nb11, ulong nb11,
int nb12, ulong nb12,
int nb13, ulong nb13,
int ne0, int ne0,
int ne1, int ne1,
int ne2, int ne2,
int ne3, int ne3,
int nb0, ulong nb0,
int nb1, ulong nb1,
int nb2, ulong nb2,
int nb3 ulong nb3
) { ) {
src0 = src0 + offset0; src0 = src0 + offset0;
src1 = src1 + offset1; src1 = src1 + offset1;
@ -376,7 +376,7 @@ kernel void kernel_mul_row(
ulong offset1, ulong offset1,
global float4 * dst, global float4 * dst,
ulong offsetd, ulong offsetd,
int nb int ne
) { ) {
src0 = (global float4*)((global char*)src0 + offset0); src0 = (global float4*)((global char*)src0 + offset0);
src1 = (global float4*)((global char*)src1 + offset1); src1 = (global float4*)((global char*)src1 + offset1);
@ -384,7 +384,7 @@ kernel void kernel_mul_row(
// This performs better than using %. // This performs better than using %.
uint gid = get_global_id(0); uint gid = get_global_id(0);
uint idx1 = gid - (gid/nb)*nb; // get_global_id(0) % nb uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
dst[gid] = src0[gid] * src1[idx1]; dst[gid] = src0[gid] * src1[idx1];
} }
@ -509,7 +509,7 @@ kernel void kernel_norm(
global float * dst, global float * dst,
ulong offsetd, ulong offsetd,
int ne00, int ne00,
int nb01, ulong nb01,
float eps, float eps,
local float * sum local float * sum
) { ) {
@ -569,7 +569,7 @@ kernel void kernel_rms_norm(
global float * dst, global float * dst,
ulong offsetd, ulong offsetd,
int ne00, int ne00,
int nb01, ulong nb01,
float eps, float eps,
local float * sum // Note, the size depends on number of subgroups local float * sum // Note, the size depends on number of subgroups
) { ) {
@ -869,18 +869,18 @@ kernel void kernel_rope_norm_f32(
int ne01, int ne01,
int ne02, int ne02,
int ne03, int ne03,
int nb00, ulong nb00,
int nb01, ulong nb01,
int nb02, ulong nb02,
int nb03, ulong nb03,
int ne0, int ne0,
int ne1, int ne1,
int ne2, int ne2,
int ne3, int ne3,
int nb0, ulong nb0,
int nb1, ulong nb1,
int nb2, ulong nb2,
int nb3, ulong nb3,
int n_past, int n_past,
int n_dims, int n_dims,
int n_ctx_orig, int n_ctx_orig,
@ -948,18 +948,18 @@ kernel void kernel_rope_norm_f16(
int ne01, int ne01,
int ne02, int ne02,
int ne03, int ne03,
int nb00, ulong nb00,
int nb01, ulong nb01,
int nb02, ulong nb02,
int nb03, ulong nb03,
int ne0, int ne0,
int ne1, int ne1,
int ne2, int ne2,
int ne3, int ne3,
int nb0, ulong nb0,
int nb1, ulong nb1,
int nb2, ulong nb2,
int nb3, ulong nb3,
int n_past, int n_past,
int n_dims, int n_dims,
int n_ctx_orig, int n_ctx_orig,
@ -1027,18 +1027,18 @@ kernel void kernel_rope_neox_f32(
int ne01, int ne01,
int ne02, int ne02,
int ne03, int ne03,
int nb00, ulong nb00,
int nb01, ulong nb01,
int nb02, ulong nb02,
int nb03, ulong nb03,
int ne0, int ne0,
int ne1, int ne1,
int ne2, int ne2,
int ne3, int ne3,
int nb0, ulong nb0,
int nb1, ulong nb1,
int nb2, ulong nb2,
int nb3, ulong nb3,
int n_past, int n_past,
int n_dims, int n_dims,
int n_ctx_orig, int n_ctx_orig,
@ -1106,18 +1106,18 @@ kernel void kernel_rope_neox_f16(
int ne01, int ne01,
int ne02, int ne02,
int ne03, int ne03,
int nb00, ulong nb00,
int nb01, ulong nb01,
int nb02, ulong nb02,
int nb03, ulong nb03,
int ne0, int ne0,
int ne1, int ne1,
int ne2, int ne2,
int ne3, int ne3,
int nb0, ulong nb0,
int nb1, ulong nb1,
int nb2, ulong nb2,
int nb3, ulong nb3,
int n_past, int n_past,
int n_dims, int n_dims,
int n_ctx_orig, int n_ctx_orig,
@ -1185,18 +1185,18 @@ kernel void kernel_cpy_f16_f16(
int ne01, int ne01,
int ne02, int ne02,
int ne03, int ne03,
int nb00, ulong nb00,
int nb01, ulong nb01,
int nb02, ulong nb02,
int nb03, ulong nb03,
int ne0, int ne0,
int ne1, int ne1,
int ne2, int ne2,
int ne3, int ne3,
int nb0, ulong nb0,
int nb1, ulong nb1,
int nb2, ulong nb2,
int nb3 ulong nb3
) { ) {
src0 = (global half*)((global char*)src0 + offset0); src0 = (global half*)((global char*)src0 + offset0);
dst = (global half*)((global char*)dst + offsetd); dst = (global half*)((global char*)dst + offsetd);
@ -1229,18 +1229,18 @@ kernel void kernel_cpy_f16_f32(
int ne01, int ne01,
int ne02, int ne02,
int ne03, int ne03,
int nb00, ulong nb00,
int nb01, ulong nb01,
int nb02, ulong nb02,
int nb03, ulong nb03,
int ne0, int ne0,
int ne1, int ne1,
int ne2, int ne2,
int ne3, int ne3,
int nb0, ulong nb0,
int nb1, ulong nb1,
int nb2, ulong nb2,
int nb3 ulong nb3
) { ) {
src0 = (global half*)((global char*)src0 + offset0); src0 = (global half*)((global char*)src0 + offset0);
@ -1274,18 +1274,18 @@ kernel void kernel_cpy_f32_f16(
int ne01, int ne01,
int ne02, int ne02,
int ne03, int ne03,
int nb00, ulong nb00,
int nb01, ulong nb01,
int nb02, ulong nb02,
int nb03, ulong nb03,
int ne0, int ne0,
int ne1, int ne1,
int ne2, int ne2,
int ne3, int ne3,
int nb0, ulong nb0,
int nb1, ulong nb1,
int nb2, ulong nb2,
int nb3 ulong nb3
) { ) {
src0 = (global float*)((global char*)src0 + offset0); src0 = (global float*)((global char*)src0 + offset0);
dst = (global half*)((global char*)dst + offsetd); dst = (global half*)((global char*)dst + offsetd);
@ -1319,18 +1319,18 @@ kernel void kernel_cpy_f32_f32(
int ne01, int ne01,
int ne02, int ne02,
int ne03, int ne03,
int nb00, ulong nb00,
int nb01, ulong nb01,
int nb02, ulong nb02,
int nb03, ulong nb03,
int ne0, int ne0,
int ne1, int ne1,
int ne2, int ne2,
int ne3, int ne3,
int nb0, ulong nb0,
int nb1, ulong nb1,
int nb2, ulong nb2,
int nb3 ulong nb3
) { ) {
src0 = (global float*)((global char*)src0 + offset0); src0 = (global float*)((global char*)src0 + offset0);
dst = (global float*)((global char*)dst + offsetd); dst = (global float*)((global char*)dst + offsetd);
@ -1366,13 +1366,13 @@ kernel void kernel_get_rows_f32(
global float * dst, global float * dst,
ulong offsetd, ulong offsetd,
int ne00, int ne00,
int nb01, ulong nb01,
int nb02, ulong nb02,
int ne10, int ne10,
int nb10, ulong nb10,
int nb11, ulong nb11,
int nb1, ulong nb1,
int nb2 ulong nb2
) { ) {
src0 = (global void*)((global char*)src0 + offset0); src0 = (global void*)((global char*)src0 + offset0);
src1 = (global int*)((global char*)src1 + offset1); src1 = (global int*)((global char*)src1 + offset1);
@ -1399,13 +1399,13 @@ kernel void kernel_get_rows_f16(
global float * dst, global float * dst,
ulong offsetd, ulong offsetd,
int ne00, int ne00,
int nb01, ulong nb01,
int nb02, ulong nb02,
int ne10, int ne10,
int nb10, ulong nb10,
int nb11, ulong nb11,
int nb1, ulong nb1,
int nb2 ulong nb2
) { ) {
src0 = (global void*)((global char*)src0 + offset0); src0 = (global void*)((global char*)src0 + offset0);
src1 = (global int*)((global char*)src1 + offset1); src1 = (global int*)((global char*)src1 + offset1);
@ -1432,13 +1432,13 @@ kernel void kernel_get_rows_q4_0(
global float * dst, global float * dst,
ulong offsetd, ulong offsetd,
int ne00, int ne00,
int nb01, ulong nb01,
int nb02, ulong nb02,
int ne10, int ne10,
int nb10, ulong nb10,
int nb11, ulong nb11,
int nb1, ulong nb1,
int nb2 ulong nb2
) { ) {
src0 = (global void*)((global char*)src0 + offset0); src0 = (global void*)((global char*)src0 + offset0);
src1 = (global int*)((global char*)src1 + offset1); src1 = (global int*)((global char*)src1 + offset1);
@ -1476,17 +1476,17 @@ kernel void kernel_mul_mat_f32_f32(
int ne00, int ne00,
int ne01, int ne01,
int ne02, int ne02,
int nb00, ulong nb00,
int nb01, ulong nb01,
int nb02, ulong nb02,
int nb03, ulong nb03,
int ne10, int ne10,
int ne11, int ne11,
int ne12, int ne12,
int nb10, ulong nb10,
int nb11, ulong nb11,
int nb12, ulong nb12,
int nb13, ulong nb13,
int ne0, int ne0,
int ne1, int ne1,
int r2, int r2,
@ -1575,17 +1575,17 @@ kernel void kernel_mul_mat_f16_f16(
int ne00, int ne00,
int ne01, int ne01,
int ne02, int ne02,
int nb00, ulong nb00,
int nb01, ulong nb01,
int nb02, ulong nb02,
int nb03, ulong nb03,
int ne10, int ne10,
int ne11, int ne11,
int ne12, int ne12,
int nb10, ulong nb10,
int nb11, ulong nb11,
int nb12, ulong nb12,
int nb13, ulong nb13,
int ne0, int ne0,
int ne1, int ne1,
int r2, int r2,
@ -1672,17 +1672,17 @@ kernel void kernel_mul_mat_f16_f32_1row(
int ne00, int ne00,
int ne01, int ne01,
int ne02, int ne02,
int nb00, ulong nb00,
int nb01, ulong nb01,
int nb02, ulong nb02,
int nb03, ulong nb03,
int ne10, int ne10,
int ne11, int ne11,
int ne12, int ne12,
int nb10, ulong nb10,
int nb11, ulong nb11,
int nb12, ulong nb12,
int nb13, ulong nb13,
int ne0, int ne0,
int ne1, int ne1,
int r2, int r2,
@ -1752,17 +1752,17 @@ kernel void kernel_mul_mat_f16_f32(
int ne00, int ne00,
int ne01, int ne01,
int ne02, int ne02,
int nb00, ulong nb00,
int nb01, ulong nb01,
int nb02, ulong nb02,
int nb03, ulong nb03,
int ne10, int ne10,
int ne11, int ne11,
int ne12, int ne12,
int nb10, ulong nb10,
int nb11, ulong nb11,
int nb12, ulong nb12,
int nb13, ulong nb13,
int ne0, int ne0,
int ne1, int ne1,
int r2, int r2,
@ -1853,17 +1853,17 @@ kernel void kernel_mul_mat_f16_f32_l4(
int ne00, int ne00,
int ne01, int ne01,
int ne02, int ne02,
int nb00, ulong nb00,
int nb01, ulong nb01,
int nb02, ulong nb02,
int nb03, ulong nb03,
int ne10, int ne10,
int ne11, int ne11,
int ne12, int ne12,
int nb10, ulong nb10,
int nb11, ulong nb11,
int nb12, ulong nb12,
int nb13, ulong nb13,
int ne0, int ne0,
int ne1, int ne1,
int r2, int r2,
@ -1954,7 +1954,7 @@ inline void mul_vec_q_n_f32(
int r3 int r3
) { ) {
const int nb = ne00/QK4_0; const ulong nb = ne00/QK4_0;
int r0 = get_group_id(0); int r0 = get_group_id(0);
int r1 = get_group_id(1); int r1 = get_group_id(1);
@ -2113,7 +2113,7 @@ inline void mul_vec_q_n_f32_v(
int r2, int r2,
int r3 int r3
) { ) {
const int nb = ne00/QK4_0; const ulong nb = ne00/QK4_0;
int r0 = get_group_id(0); int r0 = get_group_id(0);
int r1 = get_group_id(1); int r1 = get_group_id(1);
@ -2363,7 +2363,7 @@ inline void mul_vec_q_n_f32_flat(
int r2, int r2,
int r3 int r3
) { ) {
const int nb = ne00/QK4_0; const ulong nb = ne00/QK4_0;
int r0 = get_group_id(0); int r0 = get_group_id(0);
int r1 = get_group_id(1); int r1 = get_group_id(1);
@ -2530,7 +2530,7 @@ inline void mul_vec_q_n_f32_8x_flat(
int r2, int r2,
int r3 int r3
) { ) {
const int nb = ne00/QK4_0; const ulong nb = ne00/QK4_0;
int r0 = get_group_id(0); int r0 = get_group_id(0);
int r1 = get_group_id(1); int r1 = get_group_id(1);