diff --git a/ggml/src/ggml-opencl2/ggml-opencl2.cpp b/ggml/src/ggml-opencl2/ggml-opencl2.cpp index a74dac659..451efb8de 100644 --- a/ggml/src/ggml-opencl2/ggml-opencl2.cpp +++ b/ggml/src/ggml-opencl2/ggml-opencl2.cpp @@ -799,7 +799,7 @@ struct ggml_tensor_extra_cl { // and view operation. // NB: this offset no longer includes view offset (view_offs). Whenever this // offset is used, view_offs should be considered. - size_t offset; + cl_ulong offset; // The actual size of the cl_mem object. This is needed when returning the // block to the pool. size_t actual_size; @@ -1969,9 +1969,9 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - int offset0 = extra0->offset + src0->view_offs; - int offset1 = extra1->offset + src1->view_offs; - int offsetd = extrad->offset + dst->view_offs; + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offset1 = extra1->offset + src1->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; cl_kernel kernel; @@ -1990,11 +1990,11 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &nb01)); CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &nb02)); @@ -2199,9 +2199,9 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - int offset0 = extra0->offset + src0->view_offs; - int offset1 = extra1->offset + src1->view_offs; - int offsetd = extrad->offset + dst->view_offs; + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offset1 = extra1->offset + src1->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; bool bcast_row = false; int nb = ne00; @@ -2218,21 +2218,21 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const kernel = backend_ctx->kernel_mul_row; CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &nb)); } else { kernel = backend_ctx->kernel_mul; CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); @@ -2304,8 +2304,8 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - int offset0 = extra0->offset + src0->view_offs; - int offsetd = extrad->offset + dst->view_offs; + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; cl_kernel kernel; @@ -2319,9 +2319,9 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; @@ -2351,8 +2351,8 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - int offset0 = extra0->offset + src0->view_offs; - int offsetd = extrad->offset + dst->view_offs; + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; cl_kernel kernel; @@ -2366,9 +2366,9 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; @@ -2398,15 +2398,15 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - int offset0 = extra0->offset + src0->view_offs; - int offsetd = extrad->offset + dst->view_offs; + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; cl_kernel kernel = backend_ctx->kernel_relu; CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); const int64_t n = ggml_nelements(dst); @@ -2438,8 +2438,8 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - int offset0 = extra0->offset + src0->view_offs; - int offsetd = extrad->offset + dst->view_offs; + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; float min; float max; @@ -2449,9 +2449,9 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons cl_kernel kernel = backend_ctx->kernel_clamp; CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &min)); CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float), &max)); @@ -2485,8 +2485,8 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - int offset0 = extra0->offset + src0->view_offs; - int offsetd = extrad->offset + dst->view_offs; + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; float eps; memcpy(&eps, dst->op_params, sizeof(float)); @@ -2501,9 +2501,9 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const cl_kernel kernel = backend_ctx->kernel_norm; CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &nb01)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps)); @@ -2542,8 +2542,8 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - int offset0 = extra0->offset + src0->view_offs; - int offsetd = extrad->offset + dst->view_offs; + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; float eps; memcpy(&eps, dst->op_params, sizeof(float)); @@ -2574,9 +2574,9 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c sizeof(size_t), &sgs, NULL)); CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &nb01)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps)); @@ -2612,9 +2612,9 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - int offset0 = extra0->offset + src0->view_offs; - int offset1 = extra1->offset + src1->view_offs; - int offsetd = extrad->offset + dst->view_offs; + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offset1 = extra1->offset + src1->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; #ifdef GGML_OPENCL_SOA_Q ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra; @@ -2861,12 +2861,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co cl_uint k_arg = 0; if (N == 1) { - CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d)); - CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extra0_q4_0->d)); - CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_image1d)); - CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &extra1->offset)); - CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &extrad->offset)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extra0_q4_0->d)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_image1d)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extra1->offset)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extrad->offset)); CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne00)); CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne01)); CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02)); @@ -2996,9 +2996,9 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); @@ -3060,11 +3060,11 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); @@ -3113,11 +3113,11 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); @@ -3162,9 +3162,9 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); @@ -3196,11 +3196,11 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); @@ -3232,11 +3232,11 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); @@ -3328,16 +3328,16 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - int offset0 = extra0->offset + src0->view_offs; - int offsetd = extrad->offset + dst->view_offs; + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; cl_kernel kernel = backend_ctx->kernel_scale; CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &scale)); @@ -3396,8 +3396,8 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; - int offset0 = extra0->offset + src0->view_offs; - int offset1 = extra1->offset + src1->view_offs; + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offset1 = extra1->offset + src1->view_offs; cl_kernel kernel; @@ -3431,9 +3431,9 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02)); @@ -3492,8 +3492,8 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - int offset0 = extra0->offset + src0->view_offs; - int offsetd = extrad->offset + dst->view_offs; + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; cl_kernel kernel; @@ -3501,9 +3501,9 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr kernel = backend_ctx->kernel_diag_mask_inf_8; CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &n_past)); @@ -3524,9 +3524,9 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr kernel = backend_ctx->kernel_diag_mask_inf; CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &n_past)); @@ -3569,10 +3569,10 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c ggml_tensor_extra_cl * extra1 = src1 ? (ggml_tensor_extra_cl *)src1->extra : nullptr; - int offset0 = extra0->offset + src0->view_offs; - int offsetd = extrad->offset + dst->view_offs; + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; - int offset1 = extra1 ? extra1->offset + src1->view_offs : offset0; + cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0; const int ne00 = src0 ? src0->ne[0] : 0; const int ne01 = src0 ? src0->ne[1] : 0; @@ -3614,11 +3614,11 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), extra1 ? &extra1->data_device : &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); @@ -3657,14 +3657,14 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - int offset0 = extra0->offset + src0->view_offs; - int offset1 = extra1->offset + src1->view_offs; - int offsetd = extrad->offset + dst->view_offs; + cl_ulong offset0 = extra0->offset + src0->view_offs; + cl_ulong offset1 = extra1->offset + src1->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; ggml_tensor * src2 = dst->src[2]; ggml_tensor_extra_cl * extra2 = src2 ? (ggml_tensor_extra_cl *)src2->extra : nullptr; - int offset2 = extra2 ? extra2->offset + src2->view_offs : offset0; + cl_ulong offset2 = extra2 ? extra2->offset + src2->view_offs : offset0; const int ne00 = src0 ? src0->ne[0] : 0; const int ne01 = src0 ? src0->ne[1] : 0; @@ -3743,13 +3743,13 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const } CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &offset1)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), extra2 ? &extra2->data_device : &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &offset2)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00)); CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01)); CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02)); diff --git a/ggml/src/ggml-opencl2/kernels/ggml-opencl2.cl b/ggml/src/ggml-opencl2/kernels/ggml-opencl2.cl index 135bbe087..cf64bf7d0 100644 --- a/ggml/src/ggml-opencl2/kernels/ggml-opencl2.cl +++ b/ggml/src/ggml-opencl2/kernels/ggml-opencl2.cl @@ -315,11 +315,11 @@ kernel void kernel_add_row( //------------------------------------------------------------------------------ kernel void kernel_mul( global char * src0, - int offset0, + ulong offset0, global char * src1, - int offset1, + ulong offset1, global char * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int ne02, @@ -371,11 +371,11 @@ kernel void kernel_mul( // broadcast src1 into src0 kernel void kernel_mul_row( global float4 * src0, - int offset0, + ulong offset0, global float4 * src1, - int offset1, + ulong offset1, global float4 * dst, - int offsetd, + ulong offsetd, int nb ) { src0 = (global float4*)((global char*)src0 + offset0); @@ -393,9 +393,9 @@ kernel void kernel_mul_row( //------------------------------------------------------------------------------ kernel void kernel_scale( global float4 * src0, - int offset0, + ulong offset0, global float4 * dst, - int offsetd, + ulong offsetd, float scale ) { src0 = (global float4*)((global char*)src0 + offset0); @@ -411,9 +411,9 @@ kernel void kernel_scale( kernel void kernel_gelu( global float * src0, - int offset0, + ulong offset0, global float * dst, - int offsetd + ulong offsetd ) { src0 = (global float*)((global char*)src0 + offset0); dst = (global float*)((global char*)dst + offsetd); @@ -425,9 +425,9 @@ kernel void kernel_gelu( kernel void kernel_gelu_4( global float4 * src0, - int offset0, + ulong offset0, global float4 * dst, - int offsetd + ulong offsetd ) { src0 = (global float4*)((global char*)src0 + offset0); dst = (global float4*)((global char*)dst + offsetd); @@ -442,9 +442,9 @@ kernel void kernel_gelu_4( //------------------------------------------------------------------------------ kernel void kernel_silu( global float * src0, - int offset0, + ulong offset0, global float * dst, - int offsetd + ulong offsetd ) { src0 = (global float*)((global char*)src0 + offset0); dst = (global float*)((global char*)dst + offsetd); @@ -455,9 +455,9 @@ kernel void kernel_silu( kernel void kernel_silu_4( global float4 * src0, - int offset0, + ulong offset0, global float4 * dst, - int offsetd + ulong offsetd ) { src0 = (global float4*)((global char*)src0 + offset0); dst = (global float4*)((global char*)dst + offsetd); @@ -471,9 +471,9 @@ kernel void kernel_silu_4( //------------------------------------------------------------------------------ kernel void kernel_relu( global float * src0, - int offset0, + ulong offset0, global float * dst, - int offsetd + ulong offsetd ) { src0 = (global float*)((global char*)src0 + offset0); dst = (global float*)((global char*)dst + offsetd); @@ -486,9 +486,9 @@ kernel void kernel_relu( //------------------------------------------------------------------------------ kernel void kernel_clamp( global float * src0, - int offset0, + ulong offset0, global float * dst, - int offsetd, + ulong offsetd, float min, float max ) { @@ -505,9 +505,9 @@ kernel void kernel_clamp( //------------------------------------------------------------------------------ kernel void kernel_norm( global void * src0, - int offset0, + ulong offset0, global float * dst, - int offsetd, + ulong offsetd, int ne00, int nb01, float eps, @@ -565,9 +565,9 @@ kernel void kernel_norm( // This kernel depends on subgroup size. kernel void kernel_rms_norm( global void * src0, - int offset0, + ulong offset0, global float * dst, - int offsetd, + ulong offsetd, int ne00, int nb01, float eps, @@ -627,9 +627,9 @@ kernel void kernel_rms_norm( //------------------------------------------------------------------------------ kernel void kernel_diag_mask_inf( global float * src0, - int offset0, + ulong offset0, global float * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int n_past @@ -650,9 +650,9 @@ kernel void kernel_diag_mask_inf( kernel void kernel_diag_mask_inf_8( global float4 * src0, - int offset0, + ulong offset0, global float4 * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int n_past @@ -684,11 +684,11 @@ kernel void kernel_diag_mask_inf_8( //------------------------------------------------------------------------------ kernel void kernel_soft_max( global float * src0, - int offset0, + ulong offset0, global float * src1, - int offset1, + ulong offset1, global float * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int ne02, @@ -751,11 +751,11 @@ REQD_SUBGROUP_SIZE_64 #endif kernel void kernel_soft_max_4( global float * src0, - int offset0, + ulong offset0, global float * src1, - int offset1, + ulong offset1, global float * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int ne02, @@ -858,13 +858,13 @@ float2 rope_yarn_corr_dims( kernel void kernel_rope_norm_f32( global void * src0, - int offset0, + ulong offset0, global int * src1, - int offset1, + ulong offset1, global float * src2, - int offset2, + ulong offset2, global float * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int ne02, @@ -937,13 +937,13 @@ kernel void kernel_rope_norm_f32( kernel void kernel_rope_norm_f16( global void * src0, - int offset0, + ulong offset0, global int * src1, - int offset1, + ulong offset1, global float * src2, - int offset2, + ulong offset2, global float * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int ne02, @@ -1016,13 +1016,13 @@ kernel void kernel_rope_norm_f16( kernel void kernel_rope_neox_f32( global void * src0, - int offset0, + ulong offset0, global int * src1, - int offset1, + ulong offset1, global float * src2, - int offset2, + ulong offset2, global float * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int ne02, @@ -1095,13 +1095,13 @@ kernel void kernel_rope_neox_f32( kernel void kernel_rope_neox_f16( global void * src0, - int offset0, + ulong offset0, global int * src1, - int offset1, + ulong offset1, global float * src2, - int offset2, + ulong offset2, global float * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int ne02, @@ -1178,9 +1178,9 @@ kernel void kernel_rope_neox_f16( kernel void kernel_cpy_f16_f16( global half * src0, - int offset0, + ulong offset0, global half * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int ne02, @@ -1222,9 +1222,9 @@ kernel void kernel_cpy_f16_f16( kernel void kernel_cpy_f16_f32( global half * src0, - int offset0, + ulong offset0, global float * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int ne02, @@ -1267,9 +1267,9 @@ kernel void kernel_cpy_f16_f32( kernel void kernel_cpy_f32_f16( global float * src0, - int offset0, + ulong offset0, global half * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int ne02, @@ -1312,9 +1312,9 @@ kernel void kernel_cpy_f32_f16( kernel void kernel_cpy_f32_f32( global float * src0, - int offset0, + ulong offset0, global float * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int ne02, @@ -1360,11 +1360,11 @@ kernel void kernel_cpy_f32_f32( //------------------------------------------------------------------------------ kernel void kernel_get_rows_f32( global void * src0, - int offset0, + ulong offset0, global int * src1, - int offset1, + ulong offset1, global float * dst, - int offsetd, + ulong offsetd, int ne00, int nb01, int nb02, @@ -1393,11 +1393,11 @@ kernel void kernel_get_rows_f32( kernel void kernel_get_rows_f16( global void * src0, - int offset0, + ulong offset0, global int * src1, - int offset1, + ulong offset1, global float * dst, - int offsetd, + ulong offsetd, int ne00, int nb01, int nb02, @@ -1426,11 +1426,11 @@ kernel void kernel_get_rows_f16( kernel void kernel_get_rows_q4_0( global void * src0, - int offset0, + ulong offset0, global int * src1, - int offset1, + ulong offset1, global float * dst, - int offsetd, + ulong offsetd, int ne00, int nb01, int nb02, @@ -1468,11 +1468,11 @@ kernel void kernel_get_rows_q4_0( kernel void kernel_mul_mat_f32_f32( global char * src0, - int offset0, + ulong offset0, global char * src1, - int offset1, + ulong offset1, global float * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int ne02, @@ -1503,7 +1503,7 @@ kernel void kernel_mul_mat_f32_f32( int i12 = im%ne12; int i13 = im/ne12; - int offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; + ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; global float * x = (global float *) (src0 + offset_src0); @@ -1514,7 +1514,7 @@ kernel void kernel_mul_mat_f32_f32( break; } - int offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; + ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; global float * y = (global float *) (src1 + offset_src1); @@ -1536,7 +1536,7 @@ kernel void kernel_mul_mat_f32_f32( break; } - int offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; + ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; global float * y = (global float *) (src1 + offset_src1); global float4 * y4 = (global float4 *) y; @@ -1567,11 +1567,11 @@ kernel void kernel_mul_mat_f32_f32( kernel void kernel_mul_mat_f16_f16( global char * src0, - int offset0, + ulong offset0, global char * src1, - int offset1, + ulong offset1, global float * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int ne02, @@ -1602,7 +1602,7 @@ kernel void kernel_mul_mat_f16_f16( int i12 = im%ne12; int i13 = im/ne12; - int offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; + ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; global half * x = (global half *) (src0 + offset_src0); @@ -1613,7 +1613,7 @@ kernel void kernel_mul_mat_f16_f16( break; } - int offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; + ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; global half * y = (global half *) (src1 + offset_src1); @@ -1635,7 +1635,7 @@ kernel void kernel_mul_mat_f16_f16( break; } - int offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; + ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; global half * y = (global half *) (src1 + offset_src1); global half4 * y4 = (global half4 *) y; @@ -1664,11 +1664,11 @@ kernel void kernel_mul_mat_f16_f16( //------------------------------------------------------------------------------ kernel void kernel_mul_mat_f16_f32_1row( global char * src0, - int offset0, + ulong offset0, global char * src1, - int offset1, + ulong offset1, global float * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int ne02, @@ -1699,8 +1699,8 @@ kernel void kernel_mul_mat_f16_f32_1row( int i12 = im%ne12; int i13 = im/ne12; - int offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; - int offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; + ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; + ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; global half * x = (global half *) (src0 + offset_src0); global float * y = (global float *) (src1 + offset_src1); @@ -1744,11 +1744,11 @@ REQD_SUBGROUP_SIZE_64 #endif kernel void kernel_mul_mat_f16_f32( global char * src0, - int offset0, + ulong offset0, global char * src1, - int offset1, + ulong offset1, global float * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int ne02, @@ -1779,7 +1779,7 @@ kernel void kernel_mul_mat_f16_f32( int i12 = im%ne12; int i13 = im/ne12; - int offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; + ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; global half * x = (global half *) (src0 + offset_src0); @@ -1790,7 +1790,7 @@ kernel void kernel_mul_mat_f16_f32( break; } - int offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; + ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; global float * y = (global float *) (src1 + offset_src1); @@ -1812,7 +1812,7 @@ kernel void kernel_mul_mat_f16_f32( break; } - int offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; + ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; global float * y = (global float *) (src1 + offset_src1); global float4 * y4 = (global float4 *) y; @@ -1845,11 +1845,11 @@ REQD_SUBGROUP_SIZE_64 #endif kernel void kernel_mul_mat_f16_f32_l4( global char * src0, - int offset0, + ulong offset0, global char * src1, - int offset1, + ulong offset1, global float * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int ne02, @@ -1880,12 +1880,12 @@ kernel void kernel_mul_mat_f16_f32_l4( int i12 = im%ne12; int i13 = im/ne12; - int offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; + ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03; global half4 * x4 = (global half4 *) (src0 + offset_src0); for (int r1 = 0; r1 < nrows; ++r1) { - int offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; + ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13; global float4 * y4 = (global float4 *) (src1 + offset_src1); @@ -1967,7 +1967,7 @@ inline void mul_vec_q_n_f32( int i12 = im%ne12; int i13 = im/ne12; - int offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); global struct block_q4_0 * x = (global struct block_q4_0 *) src0 + offset0; global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1; @@ -2026,11 +2026,11 @@ REQD_SUBGROUP_SIZE_64 #endif kernel void kernel_mul_mat_q4_0_f32( global void * src0, - int offset0, + ulong offset0, global float * src1, - int offset1, + ulong offset1, global float * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int ne02, @@ -2126,7 +2126,7 @@ inline void mul_vec_q_n_f32_v( int i12 = im%ne12; int i13 = im/ne12; - int offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); global struct block_q4_0 * x = (global struct block_q4_0 *) src0 + offset0; global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1; @@ -2232,11 +2232,11 @@ REQD_SUBGROUP_SIZE_64 #endif kernel void kernel_mul_mat_q4_0_f32_v( global void * src0, - int offset0, + ulong offset0, global float * src1, - int offset1, + ulong offset1, global float * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int ne02, @@ -2380,9 +2380,9 @@ inline void mul_vec_q_n_f32_flat( int i13 = im/ne12; // The number of scales is the same as the number of blocks. - int offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); // Each block contains QK4_0/2 uchars, hence offset for qs is as follows. - int offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2; + ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2; global uchar * x = (global uchar *) src0_q + offset0_q; global half * d = (global half *) src0_d + offset0_d; @@ -2479,9 +2479,9 @@ kernel void kernel_mul_mat_q4_0_f32_flat( global uchar * src0_q, global half * src0_d, global float * src1, - int offset1, + ulong offset1, global float * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int ne02, @@ -2547,9 +2547,9 @@ inline void mul_vec_q_n_f32_8x_flat( int i13 = im/ne12; // The number of scales is the same as the number of blocks. - int offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); // Each block contains QK4_0/2 uchars, hence offset for qs is as follows. - int offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2; + ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2; global uchar * x = (global uchar *) src0_q + offset0_q; global half * d = (global half *) src0_d + offset0_d; @@ -2666,9 +2666,9 @@ kernel void kernel_mul_mat_q4_0_f32_8x_flat( global uchar * src0_q, global half * src0_d, global float * src1, - int offset1, + ulong offset1, global float * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int ne02, diff --git a/ggml/src/ggml-opencl2/kernels/ggml-opencl2_gemv_noshuffle.cl b/ggml/src/ggml-opencl2/kernels/ggml-opencl2_gemv_noshuffle.cl index 2f5c41f55..97e55e2c6 100644 --- a/ggml/src/ggml-opencl2/kernels/ggml-opencl2_gemv_noshuffle.cl +++ b/ggml/src/ggml-opencl2/kernels/ggml-opencl2_gemv_noshuffle.cl @@ -195,9 +195,9 @@ __kernel void kernel_gemv_noshuffle( __read_only image1d_buffer_t src0_q, // quantized A global half2 * src0_d, // A scales __read_only image1d_buffer_t src1, // B - int offset1, // offset to B (0) + ulong offset1, // offset to B (0) global float * dst, // C - int offsetd, // offset to C (0) + ulong offsetd, // offset to C (0) uint K, // K int ne01, // M int ne02, // 1 diff --git a/ggml/src/ggml-opencl2/kernels/ggml-opencl2_gemv_noshuffle_general.cl b/ggml/src/ggml-opencl2/kernels/ggml-opencl2_gemv_noshuffle_general.cl index 5326af3fe..9451f0bba 100644 --- a/ggml/src/ggml-opencl2/kernels/ggml-opencl2_gemv_noshuffle_general.cl +++ b/ggml/src/ggml-opencl2/kernels/ggml-opencl2_gemv_noshuffle_general.cl @@ -195,9 +195,9 @@ __kernel void kernel_gemv_noshuffle( __read_only image1d_buffer_t src0_q, // quantized A global half2 * src0_d, // A scales __read_only image1d_buffer_t src1, // B - int offset1, // offset to B (0) + ulong offset1, // offset to B (0) global float * dst, // C - int offsetd, // offset to C (0) + ulong offsetd, // offset to C (0) int ne00, // K int ne01, // M int ne02, // 1 diff --git a/ggml/src/ggml-opencl2/kernels/ggml-opencl2_mm.cl b/ggml/src/ggml-opencl2/kernels/ggml-opencl2_mm.cl index 91311ee73..e83856013 100644 --- a/ggml/src/ggml-opencl2/kernels/ggml-opencl2_mm.cl +++ b/ggml/src/ggml-opencl2/kernels/ggml-opencl2_mm.cl @@ -170,9 +170,9 @@ inline void mul_mat_q_n_f32_1d_8x_flat( int i13 = im/ne12; // The number of scales is the same as the number of blocks. - int offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); // Each block contains QK4_0/2 uchars, hence offset for qs is as follows. - int offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2; + ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2; global uchar * x = (global uchar *) src0_q + offset0_q; global half * d = (global half *) src0_d + offset0_d; @@ -289,9 +289,9 @@ kernel void kernel_mul_mat_q4_0_f32_1d_8x_flat( global uchar * src0_q, global half * src0_d, global float * src1, - int offset1, + ulong offset1, global float * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int ne02, @@ -357,9 +357,9 @@ inline void mul_mat_q_n_f32_1d_16x_flat( int i13 = im/ne12; // The number of scales is the same as the number of blocks. - int offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); // Each block contains QK4_0/2 uchars, hence offset for qs is as follows. - int offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2; + ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2; global uchar * x = (global uchar *) src0_q + offset0_q; global half * d = (global half *) src0_d + offset0_d; @@ -518,9 +518,9 @@ kernel void kernel_mul_mat_q4_0_f32_1d_16x_flat( global uchar * src0_q, global half * src0_d, global float * src1, - int offset1, + ulong offset1, global float * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int ne02, @@ -661,9 +661,9 @@ kernel void kernel_mul_mat_q4_0_f32_flat_v0( global uchar * src0_q, global half * src0_d, global float * src1, - int offset1, + ulong offset1, global float * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int ne02, @@ -689,9 +689,9 @@ kernel void kernel_mul_mat_q4_0_f32_flat_v0( int i13 = im/ne12; // The number of scales is the same as the number of blocks. - int offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); // Each block contains QK4_0/2 uchars, hence offset for qs is as follows. - int offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2; + ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2; global uchar * x = (global uchar *) src0_q + offset0_q; global half * d = (global half *) src0_d + offset0_d; @@ -909,9 +909,9 @@ kernel void kernel_mul_mat_q4_0_f32_flat_img_v0( read_only image1d_buffer_t src0_q, read_only image1d_buffer_t src0_d, global float * src1, - int offset1, + ulong offset1, global float * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int ne02, @@ -937,9 +937,9 @@ kernel void kernel_mul_mat_q4_0_f32_flat_img_v0( int i13 = im/ne12; // The number of scales is the same as the number of blocks. - int offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); // Each block contains QK4_0/2 uchars, hence offset for qs is as follows. - int offset0_q = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + ulong offset0_q = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1; @@ -1117,11 +1117,11 @@ REQD_SUBGROUP_SIZE_64 #endif kernel void kernel_mul_mv_q6_K_f32( global void * src0, - int offset0, + ulong offset0, global float * src1, - int offset1, + ulong offset1, global float * dst, - int offsetd, + ulong offsetd, int ne00, int ne01, int ne02, @@ -1152,7 +1152,7 @@ kernel void kernel_mul_mv_q6_K_f32( int i12 = im%ne12; int i13 = im/ne12; - int offset_src0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + ulong offset_src0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); global block_q6_K * x = (global block_q6_K *) src0 + row*nb + offset_src0; global float * yy = (global float *) src1 + r1*ne10 + im*ne00*ne1;