opencl: use ulong for offsets and strides in ADD kernel
This commit is contained in:
parent
0451edd936
commit
31f305ea01
2 changed files with 82 additions and 82 deletions
|
@ -2031,30 +2031,30 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
|
|||
const int ne02 = src0 ? src0->ne[2] : 0;
|
||||
const int ne03 = src0 ? src0->ne[3] : 0;
|
||||
|
||||
const int nb00 = src0 ? src0->nb[0] : 0;
|
||||
const int nb01 = src0 ? src0->nb[1] : 0;
|
||||
const int nb02 = src0 ? src0->nb[2] : 0;
|
||||
const int nb03 = src0 ? src0->nb[3] : 0;
|
||||
const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
|
||||
const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
|
||||
const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
|
||||
const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
|
||||
|
||||
const int ne10 = src1 ? src1->ne[0] : 0;
|
||||
const int ne11 = src1 ? src1->ne[1] : 0;
|
||||
const int ne12 = src1 ? src1->ne[2] : 0;
|
||||
const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
|
||||
|
||||
const int nb10 = src1 ? src1->nb[0] : 0;
|
||||
const int nb11 = src1 ? src1->nb[1] : 0;
|
||||
const int nb12 = src1 ? src1->nb[2] : 0;
|
||||
const int nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
|
||||
const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
|
||||
const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
|
||||
const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
|
||||
const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
|
||||
|
||||
const int ne0 = dst ? dst->ne[0] : 0;
|
||||
const int ne1 = dst ? dst->ne[1] : 0;
|
||||
const int ne2 = dst ? dst->ne[2] : 0;
|
||||
const int ne3 = dst ? dst->ne[3] : 0;
|
||||
|
||||
const int nb0 = dst ? dst->nb[0] : 0;
|
||||
const int nb1 = dst ? dst->nb[1] : 0;
|
||||
const int nb2 = dst ? dst->nb[2] : 0;
|
||||
const int nb3 = dst ? dst->nb[3] : 0;
|
||||
const cl_ulong nb0 = dst ? dst->nb[0] : 0;
|
||||
const cl_ulong nb1 = dst ? dst->nb[1] : 0;
|
||||
const cl_ulong nb2 = dst ? dst->nb[2] : 0;
|
||||
const cl_ulong nb3 = dst ? dst->nb[3] : 0;
|
||||
|
||||
ggml_backend_opencl2_context *backend_ctx = (ggml_backend_opencl2_context *)backend->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
@ -2063,9 +2063,9 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
|
|||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
int offset0 = extra0->offset + src0->view_offs;
|
||||
int offset1 = extra1->offset + src1->view_offs;
|
||||
int offsetd = extrad->offset + dst->view_offs;
|
||||
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
||||
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
bool bcast_row = false;
|
||||
int nb = ne00;
|
||||
|
@ -2081,46 +2081,46 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
|
|||
nb = ne00 / 4;
|
||||
kernel = backend_ctx->kernel_add_row;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &nb));
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &nb));
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_add;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &nb00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &nb10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &nb13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &nb0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 27, sizeof(int), &nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 28, sizeof(int), &nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 29, sizeof(int), &nb3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3));
|
||||
}
|
||||
|
||||
if (bcast_row) {
|
||||
|
|
|
@ -237,35 +237,35 @@ void dequantize_q4_0_f16(global struct block_q4_0 * xb, short il, half16 * reg)
|
|||
// cons: not very efficient
|
||||
kernel void kernel_add(
|
||||
global char * src0,
|
||||
int offset0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
int offset1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
int offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
int nb00,
|
||||
int nb01,
|
||||
int nb02,
|
||||
int nb03,
|
||||
int ne10,
|
||||
int ne11,
|
||||
int ne12,
|
||||
int ne13,
|
||||
int nb10,
|
||||
int nb11,
|
||||
int nb12,
|
||||
int nb13,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3,
|
||||
int nb0,
|
||||
int nb1,
|
||||
int nb2,
|
||||
int nb3
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne10,
|
||||
int ne11,
|
||||
int ne12,
|
||||
int ne13,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3,
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
|
@ -293,11 +293,11 @@ kernel void kernel_add(
|
|||
// broadcast src1 into src0
|
||||
kernel void kernel_add_row(
|
||||
global float4 * src0,
|
||||
int offset0,
|
||||
ulong offset0,
|
||||
global float4 * src1,
|
||||
int offset1,
|
||||
ulong offset1,
|
||||
global float4 * dst,
|
||||
int offsetd,
|
||||
ulong offsetd,
|
||||
int nb
|
||||
) {
|
||||
src0 = (global float4*)((global char*)src0 + offset0);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue