diff --git a/ggml/src/ggml-opencl2/ggml-opencl2.cpp b/ggml/src/ggml-opencl2/ggml-opencl2.cpp index f71f9e71a..a74dac659 100644 --- a/ggml/src/ggml-opencl2/ggml-opencl2.cpp +++ b/ggml/src/ggml-opencl2/ggml-opencl2.cpp @@ -2031,30 +2031,30 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const const int ne02 = src0 ? src0->ne[2] : 0; const int ne03 = src0 ? src0->ne[3] : 0; - const int nb00 = src0 ? src0->nb[0] : 0; - const int nb01 = src0 ? src0->nb[1] : 0; - const int nb02 = src0 ? src0->nb[2] : 0; - const int nb03 = src0 ? src0->nb[3] : 0; + const cl_ulong nb00 = src0 ? src0->nb[0] : 0; + const cl_ulong nb01 = src0 ? src0->nb[1] : 0; + const cl_ulong nb02 = src0 ? src0->nb[2] : 0; + const cl_ulong nb03 = src0 ? src0->nb[3] : 0; const int ne10 = src1 ? src1->ne[0] : 0; const int ne11 = src1 ? src1->ne[1] : 0; const int ne12 = src1 ? src1->ne[2] : 0; const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13); - const int nb10 = src1 ? src1->nb[0] : 0; - const int nb11 = src1 ? src1->nb[1] : 0; - const int nb12 = src1 ? src1->nb[2] : 0; - const int nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13); + const cl_ulong nb10 = src1 ? src1->nb[0] : 0; + const cl_ulong nb11 = src1 ? src1->nb[1] : 0; + const cl_ulong nb12 = src1 ? src1->nb[2] : 0; + const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13); const int ne0 = dst ? dst->ne[0] : 0; const int ne1 = dst ? dst->ne[1] : 0; const int ne2 = dst ? dst->ne[2] : 0; const int ne3 = dst ? dst->ne[3] : 0; - const int nb0 = dst ? dst->nb[0] : 0; - const int nb1 = dst ? dst->nb[1] : 0; - const int nb2 = dst ? dst->nb[2] : 0; - const int nb3 = dst ? dst->nb[3] : 0; + const cl_ulong nb0 = dst ? dst->nb[0] : 0; + const cl_ulong nb1 = dst ? dst->nb[1] : 0; + const cl_ulong nb2 = dst ? dst->nb[2] : 0; + const cl_ulong nb3 = dst ? dst->nb[3] : 0; ggml_backend_opencl2_context *backend_ctx = (ggml_backend_opencl2_context *)backend->context; cl_command_queue queue = backend_ctx->queue; @@ -2063,9 +2063,9 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - int offset0 = extra0->offset + src0->view_offs; - int offset1 = extra1->offset + src1->view_offs; - int offsetd = extrad->offset + dst->view_offs; + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offset1 = extra1->offset + src1->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; bool bcast_row = false; int nb = ne00; @@ -2081,46 +2081,46 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const nb = ne00 / 4; kernel = backend_ctx->kernel_add_row; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &nb)); + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &nb)); } else { kernel = backend_ctx->kernel_add; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); - CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); - CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); - CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03)); - CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &nb00)); - CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &nb01)); - CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &nb02)); - CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &nb03)); - CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10)); - CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11)); - CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12)); - CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13)); - CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &nb10)); - CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &nb11)); - CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &nb12)); - CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &nb13)); - CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0)); - CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1)); - CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2)); - CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3)); - CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &nb0)); - CL_CHECK(clSetKernelArg(kernel, 27, sizeof(int), &nb1)); - CL_CHECK(clSetKernelArg(kernel, 28, sizeof(int), &nb2)); - CL_CHECK(clSetKernelArg(kernel, 29, sizeof(int), &nb3)); + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10)); + CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11)); + CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12)); + CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13)); + CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0)); + CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1)); + CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2)); + CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3)); + CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0)); + CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1)); + CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2)); + CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3)); } if (bcast_row) { diff --git a/ggml/src/ggml-opencl2/kernels/ggml-opencl2.cl b/ggml/src/ggml-opencl2/kernels/ggml-opencl2.cl index e6c25b380..135bbe087 100644 --- a/ggml/src/ggml-opencl2/kernels/ggml-opencl2.cl +++ b/ggml/src/ggml-opencl2/kernels/ggml-opencl2.cl @@ -237,35 +237,35 @@ void dequantize_q4_0_f16(global struct block_q4_0 * xb, short il, half16 * reg) // cons: not very efficient kernel void kernel_add( global char * src0, - int offset0, + ulong offset0, global char * src1, - int offset1, + ulong offset1, global char * dst, - int offsetd, - int ne00, - int ne01, - int ne02, - int ne03, - int nb00, - int nb01, - int nb02, - int nb03, - int ne10, - int ne11, - int ne12, - int ne13, - int nb10, - int nb11, - int nb12, - int nb13, - int ne0, - int ne1, - int ne2, - int ne3, - int nb0, - int nb1, - int nb2, - int nb3 + ulong offsetd, + int ne00, + int ne01, + int ne02, + int ne03, + ulong nb00, + ulong nb01, + ulong nb02, + ulong nb03, + int ne10, + int ne11, + int ne12, + int ne13, + ulong nb10, + ulong nb11, + ulong nb12, + ulong nb13, + int ne0, + int ne1, + int ne2, + int ne3, + ulong nb0, + ulong nb1, + ulong nb2, + ulong nb3 ) { src0 = src0 + offset0; src1 = src1 + offset1; @@ -293,11 +293,11 @@ kernel void kernel_add( // broadcast src1 into src0 kernel void kernel_add_row( global float4 * src0, - int offset0, + ulong offset0, global float4 * src1, - int offset1, + ulong offset1, global float4 * dst, - int offsetd, + ulong offsetd, int nb ) { src0 = (global float4*)((global char*)src0 + offset0);