opencl: use cl_ulong for all offsets

This commit is contained in:
Max Krasnyansky 2024-12-07 17:44:42 -08:00
parent 31f305ea01
commit c21fc8c5f9
5 changed files with 228 additions and 228 deletions

View file

@ -799,7 +799,7 @@ struct ggml_tensor_extra_cl {
// and view operation.
// NB: this offset no longer includes view offset (view_offs). Whenever this
// offset is used, view_offs should be considered.
size_t offset;
cl_ulong offset;
// The actual size of the cl_mem object. This is needed when returning the
// block to the pool.
size_t actual_size;
@ -1969,9 +1969,9 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
int offset0 = extra0->offset + src0->view_offs;
int offset1 = extra1->offset + src1->view_offs;
int offsetd = extrad->offset + dst->view_offs;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offset1 = extra1->offset + src1->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
cl_kernel kernel;
@ -1990,11 +1990,11 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &nb01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &nb02));
@ -2199,9 +2199,9 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
int offset0 = extra0->offset + src0->view_offs;
int offset1 = extra1->offset + src1->view_offs;
int offsetd = extrad->offset + dst->view_offs;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offset1 = extra1->offset + src1->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
bool bcast_row = false;
int nb = ne00;
@ -2218,21 +2218,21 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
kernel = backend_ctx->kernel_mul_row;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &nb));
} else {
kernel = backend_ctx->kernel_mul;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
@ -2304,8 +2304,8 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
int offset0 = extra0->offset + src0->view_offs;
int offsetd = extrad->offset + dst->view_offs;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
cl_kernel kernel;
@ -2319,9 +2319,9 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
size_t global_work_size[] = {(size_t)n, 1, 1};
size_t local_work_size[] = {64, 1, 1};
@ -2351,8 +2351,8 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
int offset0 = extra0->offset + src0->view_offs;
int offsetd = extrad->offset + dst->view_offs;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
cl_kernel kernel;
@ -2366,9 +2366,9 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
size_t global_work_size[] = {(size_t)n, 1, 1};
size_t local_work_size[] = {64, 1, 1};
@ -2398,15 +2398,15 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
int offset0 = extra0->offset + src0->view_offs;
int offsetd = extrad->offset + dst->view_offs;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
cl_kernel kernel = backend_ctx->kernel_relu;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
const int64_t n = ggml_nelements(dst);
@ -2438,8 +2438,8 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
int offset0 = extra0->offset + src0->view_offs;
int offsetd = extrad->offset + dst->view_offs;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
float min;
float max;
@ -2449,9 +2449,9 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
cl_kernel kernel = backend_ctx->kernel_clamp;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &min));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float), &max));
@ -2485,8 +2485,8 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
int offset0 = extra0->offset + src0->view_offs;
int offsetd = extrad->offset + dst->view_offs;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
@ -2501,9 +2501,9 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
cl_kernel kernel = backend_ctx->kernel_norm;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &nb01));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
@ -2542,8 +2542,8 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
int offset0 = extra0->offset + src0->view_offs;
int offsetd = extrad->offset + dst->view_offs;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
@ -2574,9 +2574,9 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
sizeof(size_t), &sgs, NULL));
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &nb01));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
@ -2612,9 +2612,9 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
int offset0 = extra0->offset + src0->view_offs;
int offset1 = extra1->offset + src1->view_offs;
int offsetd = extrad->offset + dst->view_offs;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offset1 = extra1->offset + src1->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
#ifdef GGML_OPENCL_SOA_Q
ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
@ -2861,12 +2861,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
cl_uint k_arg = 0;
if (N == 1) {
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extra0_q4_0->d));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_image1d));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &extra1->offset));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &extrad->offset));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extra0_q4_0->d));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_image1d));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extra1->offset));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extrad->offset));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02));
@ -2996,9 +2996,9 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
@ -3060,11 +3060,11 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
@ -3113,11 +3113,11 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
@ -3162,9 +3162,9 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
@ -3196,11 +3196,11 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
@ -3232,11 +3232,11 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
@ -3328,16 +3328,16 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
int offset0 = extra0->offset + src0->view_offs;
int offsetd = extrad->offset + dst->view_offs;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
cl_kernel kernel = backend_ctx->kernel_scale;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &scale));
@ -3396,8 +3396,8 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
int offset0 = extra0->offset + src0->view_offs;
int offset1 = extra1->offset + src1->view_offs;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offset1 = extra1->offset + src1->view_offs;
cl_kernel kernel;
@ -3431,9 +3431,9 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
@ -3492,8 +3492,8 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
int offset0 = extra0->offset + src0->view_offs;
int offsetd = extrad->offset + dst->view_offs;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
cl_kernel kernel;
@ -3501,9 +3501,9 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
kernel = backend_ctx->kernel_diag_mask_inf_8;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &n_past));
@ -3524,9 +3524,9 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
kernel = backend_ctx->kernel_diag_mask_inf;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &n_past));
@ -3569,10 +3569,10 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
ggml_tensor_extra_cl * extra1 = src1 ? (ggml_tensor_extra_cl *)src1->extra : nullptr;
int offset0 = extra0->offset + src0->view_offs;
int offsetd = extrad->offset + dst->view_offs;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
int offset1 = extra1 ? extra1->offset + src1->view_offs : offset0;
cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0;
const int ne00 = src0 ? src0->ne[0] : 0;
const int ne01 = src0 ? src0->ne[1] : 0;
@ -3614,11 +3614,11 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), extra1 ? &extra1->data_device : &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
@ -3657,14 +3657,14 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
int offset0 = extra0->offset + src0->view_offs;
int offset1 = extra1->offset + src1->view_offs;
int offsetd = extrad->offset + dst->view_offs;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offset1 = extra1->offset + src1->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
ggml_tensor * src2 = dst->src[2];
ggml_tensor_extra_cl * extra2 = src2 ? (ggml_tensor_extra_cl *)src2->extra : nullptr;
int offset2 = extra2 ? extra2->offset + src2->view_offs : offset0;
cl_ulong offset2 = extra2 ? extra2->offset + src2->view_offs : offset0;
const int ne00 = src0 ? src0->ne[0] : 0;
const int ne01 = src0 ? src0->ne[1] : 0;
@ -3743,13 +3743,13 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), extra2 ? &extra2->data_device : &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offset2));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));

View file

@ -315,11 +315,11 @@ kernel void kernel_add_row(
//------------------------------------------------------------------------------
kernel void kernel_mul(
global char * src0,
int offset0,
ulong offset0,
global char * src1,
int offset1,
ulong offset1,
global char * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int ne02,
@ -371,11 +371,11 @@ kernel void kernel_mul(
// broadcast src1 into src0
kernel void kernel_mul_row(
global float4 * src0,
int offset0,
ulong offset0,
global float4 * src1,
int offset1,
ulong offset1,
global float4 * dst,
int offsetd,
ulong offsetd,
int nb
) {
src0 = (global float4*)((global char*)src0 + offset0);
@ -393,9 +393,9 @@ kernel void kernel_mul_row(
//------------------------------------------------------------------------------
kernel void kernel_scale(
global float4 * src0,
int offset0,
ulong offset0,
global float4 * dst,
int offsetd,
ulong offsetd,
float scale
) {
src0 = (global float4*)((global char*)src0 + offset0);
@ -411,9 +411,9 @@ kernel void kernel_scale(
kernel void kernel_gelu(
global float * src0,
int offset0,
ulong offset0,
global float * dst,
int offsetd
ulong offsetd
) {
src0 = (global float*)((global char*)src0 + offset0);
dst = (global float*)((global char*)dst + offsetd);
@ -425,9 +425,9 @@ kernel void kernel_gelu(
kernel void kernel_gelu_4(
global float4 * src0,
int offset0,
ulong offset0,
global float4 * dst,
int offsetd
ulong offsetd
) {
src0 = (global float4*)((global char*)src0 + offset0);
dst = (global float4*)((global char*)dst + offsetd);
@ -442,9 +442,9 @@ kernel void kernel_gelu_4(
//------------------------------------------------------------------------------
kernel void kernel_silu(
global float * src0,
int offset0,
ulong offset0,
global float * dst,
int offsetd
ulong offsetd
) {
src0 = (global float*)((global char*)src0 + offset0);
dst = (global float*)((global char*)dst + offsetd);
@ -455,9 +455,9 @@ kernel void kernel_silu(
kernel void kernel_silu_4(
global float4 * src0,
int offset0,
ulong offset0,
global float4 * dst,
int offsetd
ulong offsetd
) {
src0 = (global float4*)((global char*)src0 + offset0);
dst = (global float4*)((global char*)dst + offsetd);
@ -471,9 +471,9 @@ kernel void kernel_silu_4(
//------------------------------------------------------------------------------
kernel void kernel_relu(
global float * src0,
int offset0,
ulong offset0,
global float * dst,
int offsetd
ulong offsetd
) {
src0 = (global float*)((global char*)src0 + offset0);
dst = (global float*)((global char*)dst + offsetd);
@ -486,9 +486,9 @@ kernel void kernel_relu(
//------------------------------------------------------------------------------
kernel void kernel_clamp(
global float * src0,
int offset0,
ulong offset0,
global float * dst,
int offsetd,
ulong offsetd,
float min,
float max
) {
@ -505,9 +505,9 @@ kernel void kernel_clamp(
//------------------------------------------------------------------------------
kernel void kernel_norm(
global void * src0,
int offset0,
ulong offset0,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int nb01,
float eps,
@ -565,9 +565,9 @@ kernel void kernel_norm(
// This kernel depends on subgroup size.
kernel void kernel_rms_norm(
global void * src0,
int offset0,
ulong offset0,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int nb01,
float eps,
@ -627,9 +627,9 @@ kernel void kernel_rms_norm(
//------------------------------------------------------------------------------
kernel void kernel_diag_mask_inf(
global float * src0,
int offset0,
ulong offset0,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int n_past
@ -650,9 +650,9 @@ kernel void kernel_diag_mask_inf(
kernel void kernel_diag_mask_inf_8(
global float4 * src0,
int offset0,
ulong offset0,
global float4 * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int n_past
@ -684,11 +684,11 @@ kernel void kernel_diag_mask_inf_8(
//------------------------------------------------------------------------------
kernel void kernel_soft_max(
global float * src0,
int offset0,
ulong offset0,
global float * src1,
int offset1,
ulong offset1,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int ne02,
@ -751,11 +751,11 @@ REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_soft_max_4(
global float * src0,
int offset0,
ulong offset0,
global float * src1,
int offset1,
ulong offset1,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int ne02,
@ -858,13 +858,13 @@ float2 rope_yarn_corr_dims(
kernel void kernel_rope_norm_f32(
global void * src0,
int offset0,
ulong offset0,
global int * src1,
int offset1,
ulong offset1,
global float * src2,
int offset2,
ulong offset2,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int ne02,
@ -937,13 +937,13 @@ kernel void kernel_rope_norm_f32(
kernel void kernel_rope_norm_f16(
global void * src0,
int offset0,
ulong offset0,
global int * src1,
int offset1,
ulong offset1,
global float * src2,
int offset2,
ulong offset2,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int ne02,
@ -1016,13 +1016,13 @@ kernel void kernel_rope_norm_f16(
kernel void kernel_rope_neox_f32(
global void * src0,
int offset0,
ulong offset0,
global int * src1,
int offset1,
ulong offset1,
global float * src2,
int offset2,
ulong offset2,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int ne02,
@ -1095,13 +1095,13 @@ kernel void kernel_rope_neox_f32(
kernel void kernel_rope_neox_f16(
global void * src0,
int offset0,
ulong offset0,
global int * src1,
int offset1,
ulong offset1,
global float * src2,
int offset2,
ulong offset2,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int ne02,
@ -1178,9 +1178,9 @@ kernel void kernel_rope_neox_f16(
kernel void kernel_cpy_f16_f16(
global half * src0,
int offset0,
ulong offset0,
global half * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int ne02,
@ -1222,9 +1222,9 @@ kernel void kernel_cpy_f16_f16(
kernel void kernel_cpy_f16_f32(
global half * src0,
int offset0,
ulong offset0,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int ne02,
@ -1267,9 +1267,9 @@ kernel void kernel_cpy_f16_f32(
kernel void kernel_cpy_f32_f16(
global float * src0,
int offset0,
ulong offset0,
global half * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int ne02,
@ -1312,9 +1312,9 @@ kernel void kernel_cpy_f32_f16(
kernel void kernel_cpy_f32_f32(
global float * src0,
int offset0,
ulong offset0,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int ne02,
@ -1360,11 +1360,11 @@ kernel void kernel_cpy_f32_f32(
//------------------------------------------------------------------------------
kernel void kernel_get_rows_f32(
global void * src0,
int offset0,
ulong offset0,
global int * src1,
int offset1,
ulong offset1,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int nb01,
int nb02,
@ -1393,11 +1393,11 @@ kernel void kernel_get_rows_f32(
kernel void kernel_get_rows_f16(
global void * src0,
int offset0,
ulong offset0,
global int * src1,
int offset1,
ulong offset1,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int nb01,
int nb02,
@ -1426,11 +1426,11 @@ kernel void kernel_get_rows_f16(
kernel void kernel_get_rows_q4_0(
global void * src0,
int offset0,
ulong offset0,
global int * src1,
int offset1,
ulong offset1,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int nb01,
int nb02,
@ -1468,11 +1468,11 @@ kernel void kernel_get_rows_q4_0(
kernel void kernel_mul_mat_f32_f32(
global char * src0,
int offset0,
ulong offset0,
global char * src1,
int offset1,
ulong offset1,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int ne02,
@ -1503,7 +1503,7 @@ kernel void kernel_mul_mat_f32_f32(
int i12 = im%ne12;
int i13 = im/ne12;
int offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
global float * x = (global float *) (src0 + offset_src0);
@ -1514,7 +1514,7 @@ kernel void kernel_mul_mat_f32_f32(
break;
}
int offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
global float * y = (global float *) (src1 + offset_src1);
@ -1536,7 +1536,7 @@ kernel void kernel_mul_mat_f32_f32(
break;
}
int offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
global float * y = (global float *) (src1 + offset_src1);
global float4 * y4 = (global float4 *) y;
@ -1567,11 +1567,11 @@ kernel void kernel_mul_mat_f32_f32(
kernel void kernel_mul_mat_f16_f16(
global char * src0,
int offset0,
ulong offset0,
global char * src1,
int offset1,
ulong offset1,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int ne02,
@ -1602,7 +1602,7 @@ kernel void kernel_mul_mat_f16_f16(
int i12 = im%ne12;
int i13 = im/ne12;
int offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
global half * x = (global half *) (src0 + offset_src0);
@ -1613,7 +1613,7 @@ kernel void kernel_mul_mat_f16_f16(
break;
}
int offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
global half * y = (global half *) (src1 + offset_src1);
@ -1635,7 +1635,7 @@ kernel void kernel_mul_mat_f16_f16(
break;
}
int offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
global half * y = (global half *) (src1 + offset_src1);
global half4 * y4 = (global half4 *) y;
@ -1664,11 +1664,11 @@ kernel void kernel_mul_mat_f16_f16(
//------------------------------------------------------------------------------
kernel void kernel_mul_mat_f16_f32_1row(
global char * src0,
int offset0,
ulong offset0,
global char * src1,
int offset1,
ulong offset1,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int ne02,
@ -1699,8 +1699,8 @@ kernel void kernel_mul_mat_f16_f32_1row(
int i12 = im%ne12;
int i13 = im/ne12;
int offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
int offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
global half * x = (global half *) (src0 + offset_src0);
global float * y = (global float *) (src1 + offset_src1);
@ -1744,11 +1744,11 @@ REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mat_f16_f32(
global char * src0,
int offset0,
ulong offset0,
global char * src1,
int offset1,
ulong offset1,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int ne02,
@ -1779,7 +1779,7 @@ kernel void kernel_mul_mat_f16_f32(
int i12 = im%ne12;
int i13 = im/ne12;
int offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
global half * x = (global half *) (src0 + offset_src0);
@ -1790,7 +1790,7 @@ kernel void kernel_mul_mat_f16_f32(
break;
}
int offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
global float * y = (global float *) (src1 + offset_src1);
@ -1812,7 +1812,7 @@ kernel void kernel_mul_mat_f16_f32(
break;
}
int offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
global float * y = (global float *) (src1 + offset_src1);
global float4 * y4 = (global float4 *) y;
@ -1845,11 +1845,11 @@ REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mat_f16_f32_l4(
global char * src0,
int offset0,
ulong offset0,
global char * src1,
int offset1,
ulong offset1,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int ne02,
@ -1880,12 +1880,12 @@ kernel void kernel_mul_mat_f16_f32_l4(
int i12 = im%ne12;
int i13 = im/ne12;
int offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
global half4 * x4 = (global half4 *) (src0 + offset_src0);
for (int r1 = 0; r1 < nrows; ++r1) {
int offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
global float4 * y4 = (global float4 *) (src1 + offset_src1);
@ -1967,7 +1967,7 @@ inline void mul_vec_q_n_f32(
int i12 = im%ne12;
int i13 = im/ne12;
int offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
global struct block_q4_0 * x = (global struct block_q4_0 *) src0 + offset0;
global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
@ -2026,11 +2026,11 @@ REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mat_q4_0_f32(
global void * src0,
int offset0,
ulong offset0,
global float * src1,
int offset1,
ulong offset1,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int ne02,
@ -2126,7 +2126,7 @@ inline void mul_vec_q_n_f32_v(
int i12 = im%ne12;
int i13 = im/ne12;
int offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
global struct block_q4_0 * x = (global struct block_q4_0 *) src0 + offset0;
global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
@ -2232,11 +2232,11 @@ REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mat_q4_0_f32_v(
global void * src0,
int offset0,
ulong offset0,
global float * src1,
int offset1,
ulong offset1,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int ne02,
@ -2380,9 +2380,9 @@ inline void mul_vec_q_n_f32_flat(
int i13 = im/ne12;
// The number of scales is the same as the number of blocks.
int offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
// Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
int offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
global uchar * x = (global uchar *) src0_q + offset0_q;
global half * d = (global half *) src0_d + offset0_d;
@ -2479,9 +2479,9 @@ kernel void kernel_mul_mat_q4_0_f32_flat(
global uchar * src0_q,
global half * src0_d,
global float * src1,
int offset1,
ulong offset1,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int ne02,
@ -2547,9 +2547,9 @@ inline void mul_vec_q_n_f32_8x_flat(
int i13 = im/ne12;
// The number of scales is the same as the number of blocks.
int offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
// Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
int offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
global uchar * x = (global uchar *) src0_q + offset0_q;
global half * d = (global half *) src0_d + offset0_d;
@ -2666,9 +2666,9 @@ kernel void kernel_mul_mat_q4_0_f32_8x_flat(
global uchar * src0_q,
global half * src0_d,
global float * src1,
int offset1,
ulong offset1,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int ne02,

View file

@ -195,9 +195,9 @@ __kernel void kernel_gemv_noshuffle(
__read_only image1d_buffer_t src0_q, // quantized A
global half2 * src0_d, // A scales
__read_only image1d_buffer_t src1, // B
int offset1, // offset to B (0)
ulong offset1, // offset to B (0)
global float * dst, // C
int offsetd, // offset to C (0)
ulong offsetd, // offset to C (0)
uint K, // K
int ne01, // M
int ne02, // 1

View file

@ -195,9 +195,9 @@ __kernel void kernel_gemv_noshuffle(
__read_only image1d_buffer_t src0_q, // quantized A
global half2 * src0_d, // A scales
__read_only image1d_buffer_t src1, // B
int offset1, // offset to B (0)
ulong offset1, // offset to B (0)
global float * dst, // C
int offsetd, // offset to C (0)
ulong offsetd, // offset to C (0)
int ne00, // K
int ne01, // M
int ne02, // 1

View file

@ -170,9 +170,9 @@ inline void mul_mat_q_n_f32_1d_8x_flat(
int i13 = im/ne12;
// The number of scales is the same as the number of blocks.
int offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
// Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
int offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
global uchar * x = (global uchar *) src0_q + offset0_q;
global half * d = (global half *) src0_d + offset0_d;
@ -289,9 +289,9 @@ kernel void kernel_mul_mat_q4_0_f32_1d_8x_flat(
global uchar * src0_q,
global half * src0_d,
global float * src1,
int offset1,
ulong offset1,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int ne02,
@ -357,9 +357,9 @@ inline void mul_mat_q_n_f32_1d_16x_flat(
int i13 = im/ne12;
// The number of scales is the same as the number of blocks.
int offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
// Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
int offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
global uchar * x = (global uchar *) src0_q + offset0_q;
global half * d = (global half *) src0_d + offset0_d;
@ -518,9 +518,9 @@ kernel void kernel_mul_mat_q4_0_f32_1d_16x_flat(
global uchar * src0_q,
global half * src0_d,
global float * src1,
int offset1,
ulong offset1,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int ne02,
@ -661,9 +661,9 @@ kernel void kernel_mul_mat_q4_0_f32_flat_v0(
global uchar * src0_q,
global half * src0_d,
global float * src1,
int offset1,
ulong offset1,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int ne02,
@ -689,9 +689,9 @@ kernel void kernel_mul_mat_q4_0_f32_flat_v0(
int i13 = im/ne12;
// The number of scales is the same as the number of blocks.
int offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
// Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
int offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
global uchar * x = (global uchar *) src0_q + offset0_q;
global half * d = (global half *) src0_d + offset0_d;
@ -909,9 +909,9 @@ kernel void kernel_mul_mat_q4_0_f32_flat_img_v0(
read_only image1d_buffer_t src0_q,
read_only image1d_buffer_t src0_d,
global float * src1,
int offset1,
ulong offset1,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int ne02,
@ -937,9 +937,9 @@ kernel void kernel_mul_mat_q4_0_f32_flat_img_v0(
int i13 = im/ne12;
// The number of scales is the same as the number of blocks.
int offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
// Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
int offset0_q = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
ulong offset0_q = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
@ -1117,11 +1117,11 @@ REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mv_q6_K_f32(
global void * src0,
int offset0,
ulong offset0,
global float * src1,
int offset1,
ulong offset1,
global float * dst,
int offsetd,
ulong offsetd,
int ne00,
int ne01,
int ne02,
@ -1152,7 +1152,7 @@ kernel void kernel_mul_mv_q6_K_f32(
int i12 = im%ne12;
int i13 = im/ne12;
int offset_src0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
ulong offset_src0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
global block_q6_K * x = (global block_q6_K *) src0 + row*nb + offset_src0;
global float * yy = (global float *) src1 + r1*ne10 + im*ne00*ne1;