diff --git a/ggml-sycl/norm.cpp b/ggml-sycl/norm.cpp
new file mode 100644
index 000000000..a77f7852c
--- /dev/null
+++ b/ggml-sycl/norm.cpp
@@ -0,0 +1,370 @@
+#include "norm.hpp"
+
+static void norm_f32(const float* x, float* dst, const int ncols, const float eps,
+                     const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+    const int tid = item_ct1.get_local_id(2);
+
+    const int nthreads = item_ct1.get_local_range(2);
+    const int nwarps = nthreads / WARP_SIZE;
+    assert(nwarps % WARP_SIZE == 0);
+    sycl::float2 mean_var = sycl::float2(0.f, 0.f);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const float xi = x[row * ncols + col];
+        mean_var.x() += xi;
+        mean_var.y() += xi * xi;
+    }
+
+    // sum up partial sums
+    mean_var = warp_reduce_sum(mean_var, item_ct1);
+    if (block_size > WARP_SIZE) {
+
+        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = mean_var;
+        }
+        /*
+        DPCT1118:0: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        item_ct1.barrier(sycl::access::fence_space::local_space);
+        mean_var = 0.f;
+        int nreduce = nwarps / WARP_SIZE;
+        for (size_t i = 0; i < nreduce; i += 1)
+        {
+            mean_var += s_sum[lane_id + i * WARP_SIZE];
+        }
+        mean_var = warp_reduce_sum(mean_var, item_ct1);
+    }
+
+    const float mean = mean_var.x() / ncols;
+    const float var = mean_var.y() / ncols - mean * mean;
+    const float inv_std = sycl::rsqrt(var + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[row * ncols + col] = (x[row * ncols + col] - mean) * inv_std;
+    }
+}
+
+static void group_norm_f32(const float* x, float* dst, const int group_size, const int ne_elements, const float eps,
+                           const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
+    int start = item_ct1.get_group(2) * group_size;
+    int end = start + group_size;
+    const int nthreads = item_ct1.get_local_range(2);
+    const int nwarps = nthreads / WARP_SIZE;
+    assert(nwarps % WARP_SIZE == 0);
+    start += item_ct1.get_local_id(2);
+
+    if (end >= ne_elements) {
+        end = ne_elements;
+    }
+
+    float tmp = 0.0f; // partial sum for thread in warp
+
+    for (int j = start; j < end; j += block_size) {
+        tmp += x[j];
+    }
+
+    tmp = warp_reduce_sum(tmp, item_ct1);
+    if (block_size > WARP_SIZE) {
+
+        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        /*
+        DPCT1118:1: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        /*
+        DPCT1065:54: Consider replacing sycl::nd_item::barrier() with
+        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+        better performance if there is no access to global memory.
+        */
+        item_ct1.barrier();
+        tmp = 0.f;
+        int nreduce = nwarps / WARP_SIZE;
+        for (size_t i = 0; i < nreduce; i += 1)
+        {
+            tmp += s_sum[lane_id + i * WARP_SIZE];
+        }
+        tmp = warp_reduce_sum(tmp, item_ct1);
+    }
+
+    float mean = tmp / group_size;
+    tmp = 0.0f;
+
+    for (int j = start; j < end; j += block_size) {
+        float xi = x[j] - mean;
+        dst[j] = xi;
+        tmp += xi * xi;
+    }
+
+    tmp = warp_reduce_sum(tmp, item_ct1);
+    if (block_size > WARP_SIZE) {
+
+        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        /*
+        DPCT1118:2: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        /*
+        DPCT1065:55: Consider replacing sycl::nd_item::barrier() with
+        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+        better performance if there is no access to global memory.
+        */
+        item_ct1.barrier();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp, item_ct1);
+    }
+
+    float variance = tmp / group_size;
+    float scale = sycl::rsqrt(variance + eps);
+    for (int j = start; j < end; j += block_size) {
+        dst[j] *= scale;
+    }
+}
+
+static void rms_norm_f32(const float* x, float* dst, const int ncols, const float eps,
+                         const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+    const int tid = item_ct1.get_local_id(2);
+    const int nthreads = item_ct1.get_local_range(2);
+    const int nwarps = nthreads / WARP_SIZE;
+    assert(nwarps % WARP_SIZE == 0);
+    float tmp = 0.0f; // partial sum for thread in warp
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const float xi = x[row * ncols + col];
+        tmp += xi * xi;
+    }
+
+    // sum up partial sums
+    tmp = warp_reduce_sum(tmp, item_ct1);
+    if (block_size > WARP_SIZE) {
+
+        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        /*
+        DPCT1118:3: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        item_ct1.barrier(sycl::access::fence_space::local_space);
+        int nreduce = nwarps / WARP_SIZE;
+        tmp = 0.f;
+        for (size_t i = 0; i < nreduce; i += 1)
+        {
+            tmp += s_sum[lane_id + i * WARP_SIZE];
+        }
+        tmp = warp_reduce_sum(tmp, item_ct1);
+    }
+
+    const float mean = tmp / ncols;
+    const float scale = sycl::rsqrt(mean + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[row * ncols + col] = scale * x[row * ncols + col];
+    }
+}
+
+static void norm_f32_sycl(const float* x, float* dst, const int ncols,
+                          const int nrows, const float eps,
+                          queue_ptr stream) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    if (ncols < 1024) {
+        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+        stream->submit([&](sycl::handler& cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                                  block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                        norm_f32(x, dst, ncols, eps, item_ct1,
+                                 nullptr, WARP_SIZE);
+                    });
+            });
+    }
+    else {
+        const int work_group_size = get_work_group_size(stream->get_device());
+        const sycl::range<3> block_dims(1, 1, work_group_size);
+        /*
+        DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler& cgh) {
+            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
+                sycl::range<1>(work_group_size / WARP_SIZE), cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                                  block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                        norm_f32(x, dst, ncols, eps, item_ct1,
+                                 s_sum_acc_ct1.get_pointer(), work_group_size);
+                    });
+            });
+    }
+}
+
+static void group_norm_f32_sycl(const float* x, float* dst,
+                                const int num_groups, const int group_size,
+                                const int ne_elements, queue_ptr stream) {
+    static const float eps = 1e-6f;
+    if (group_size < 1024) {
+        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+        stream->submit([&](sycl::handler& cgh) {
+            const float eps_ct4 = eps;
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
+                                  block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                        group_norm_f32(
+                            x, dst, group_size, ne_elements, eps_ct4, item_ct1,
+                            nullptr, WARP_SIZE);
+                    });
+            });
+    }
+    else {
+        const int work_group_size = get_work_group_size(stream->get_device());
+        const sycl::range<3> block_dims(1, 1, work_group_size);
+        /*
+        DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+
+        stream->submit([&](sycl::handler& cgh) {
+            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
+                                                         cgh);
+
+            const float eps_ct4 = eps;
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
+                                  block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                        group_norm_f32(x, dst, group_size, ne_elements,
+                                       eps_ct4, item_ct1,
+                                       s_sum_acc_ct1.get_pointer(), work_group_size);
+                    });
+            });
+    }
+}
+
+static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
+                              const int nrows, const float eps,
+                              queue_ptr stream) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
+    if (ncols < 1024) {
+        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+        stream->submit([&](sycl::handler& cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                                  block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                        rms_norm_f32(x, dst, ncols, eps, item_ct1,
+                                     nullptr, WARP_SIZE);
+                    });
+            });
+    }
+    else {
+        const int work_group_size = get_work_group_size(stream->get_device());
+        const sycl::range<3> block_dims(1, 1, work_group_size);
+        /*
+        DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler& cgh) {
+            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
+                                                         cgh);
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                                  block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                        rms_norm_f32(x, dst, ncols, eps, item_ct1,
+                                     s_sum_acc_ct1.get_pointer(), work_group_size);
+                    });
+            });
+    }
+}
+
+void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, const ggml_tensor* src1,
+                       ggml_tensor* dst, const float* src0_dd,
+                       const float* src1_dd, float* dst_dd,
+                       const queue_ptr& main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
+
+    (void)src1;
+    (void)dst;
+    (void)src1_dd;
+}
+
+void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
+                             const ggml_tensor* src1, ggml_tensor* dst,
+                             const float* src0_dd, const float* src1_dd,
+                             float* dst_dd,
+                             const queue_ptr& main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    int num_groups = dst->op_params[0];
+    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
+    group_norm_f32_sycl(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
+
+    (void)src1;
+    (void)dst;
+    (void)src1_dd;
+}
+
+void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
+                           const ggml_tensor* src1, ggml_tensor* dst,
+                           const float* src0_dd, const float* src1_dd,
+                           float* dst_dd,
+                           const queue_ptr& main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
+
+    (void)src1;
+    (void)dst;
+    (void)src1_dd;
+}
diff --git a/ggml-sycl/norm.hpp b/ggml-sycl/norm.hpp
new file mode 100644
index 000000000..a9ad9156f
--- /dev/null
+++ b/ggml-sycl/norm.hpp
@@ -0,0 +1,35 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_NORM_HPP
+#define GGML_SYCL_NORM_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, const ggml_tensor* src1,
+                       ggml_tensor* dst, const float* src0_dd,
+                       const float* src1_dd, float* dst_dd,
+                       const queue_ptr& main_stream);
+
+void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
+                           const ggml_tensor* src1, ggml_tensor* dst,
+                           const float* src0_dd, const float* src1_dd,
+                           float* dst_dd,
+                           const queue_ptr& main_stream);
+
+void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
+                             const ggml_tensor* src1, ggml_tensor* dst,
+                             const float* src0_dd, const float* src1_dd,
+                             float* dst_dd,
+                             const queue_ptr& main_stream);
+
+#endif // GGML_SYCL_NORM_HPP
diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp
index 5a716d0ee..110f99e54 100644
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@@ -74,51 +74,6 @@ typedef void (*ggml_sycl_op_flatten_t)(ggml_backend_sycl_context & ctx, const gg
                                        const float *src1_dd, float *dst_dd,
                                        const queue_ptr &main_stream);
 
-static __dpct_inline__ float warp_reduce_sum(float x,
-                                             const sycl::nd_item<3> &item_ct1) {
-#pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
-        /*
-        DPCT1096:98: The right-most dimension of the work-group used in the SYCL
-        kernel that calls this function may be less than "32". The function
-        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
-        CPU device. Modify the size of the work-group to ensure that the value
-        of the right-most dimension is a multiple of "32".
-        */
-        x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask);
-    }
-    return x;
-}
-
-static __dpct_inline__ sycl::float2
-warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3> &item_ct1) {
-#pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
-        a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(),
-                                                mask);
-        a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(),
-                                                mask);
-    }
-    return a;
-}
-
-static __dpct_inline__ float warp_reduce_max(float x,
-                                             const sycl::nd_item<3> &item_ct1) {
-#pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
-        /*
-        DPCT1096:97: The right-most dimension of the work-group used in the SYCL
-        kernel that calls this function may be less than "32". The function
-        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
-        CPU device. Modify the size of the work-group to ensure that the value
-        of the right-most dimension is a multiple of "32".
-        */
-        x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
-                              item_ct1.get_sub_group(), x, mask));
-    }
-    return x;
-}
-
 static __dpct_inline__ float op_repeat(const float a, const float b) {
     return b;
     GGML_UNUSED(a);
@@ -336,55 +291,6 @@ static void sqr_f32(const float * x, float * dst, const int k,
     dst[i] = x[i] * x[i];
 }
 
-static void norm_f32(const float * x, float * dst, const int ncols, const float eps,
-                     const sycl::nd_item<3> &item_ct1, sycl::float2 *s_sum, int block_size) {
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-    const int tid = item_ct1.get_local_id(2);
-
-    const int nthreads = item_ct1.get_local_range(2);
-    const int nwarps = nthreads / WARP_SIZE;
-    assert(nwarps % WARP_SIZE == 0);
-    sycl::float2 mean_var = sycl::float2(0.f, 0.f);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[row*ncols + col];
-        mean_var.x() += xi;
-        mean_var.y() += xi * xi;
-    }
-
-    // sum up partial sums
-    mean_var = warp_reduce_sum(mean_var, item_ct1);
-    if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = mean_var;
-        }
-        /*
-        DPCT1118:0: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        item_ct1.barrier(sycl::access::fence_space::local_space);
-        mean_var = 0.f;
-        int nreduce = nwarps / WARP_SIZE;
-        for (size_t i = 0; i < nreduce; i+= 1)
-        {
-            mean_var += s_sum[lane_id + i * WARP_SIZE];
-        }
-        mean_var = warp_reduce_sum(mean_var, item_ct1);
-    }
-
-    const float mean = mean_var.x() / ncols;
-    const float var = mean_var.y() / ncols - mean * mean;
-    const float inv_std = sycl::rsqrt(var + eps);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
-    }
-}
-
 static void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02,
                        const sycl::nd_item<3> &item_ct1) {
     int nidx = item_ct1.get_local_id(2) +
@@ -452,136 +358,6 @@ static void pad_f32(const float *x, float *dst, const int ne0, const int ne00,
     }
 }
 
-static void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps,
-                           const sycl::nd_item<3> &item_ct1, float *s_sum, int block_size) {
-    int start = item_ct1.get_group(2) * group_size;
-    int end = start + group_size;
-    const int nthreads = item_ct1.get_local_range(2);
-    const int nwarps = nthreads / WARP_SIZE;
-    assert(nwarps % WARP_SIZE == 0);
-    start += item_ct1.get_local_id(2);
-
-    if (end >= ne_elements) {
-        end = ne_elements;
-    }
-
-    float tmp = 0.0f; // partial sum for thread in warp
-
-    for (int j = start; j < end; j += block_size) {
-        tmp += x[j];
-    }
-
-    tmp = warp_reduce_sum(tmp, item_ct1);
-    if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        /*
-        DPCT1118:1: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        /*
-        DPCT1065:54: Consider replacing sycl::nd_item::barrier() with
-        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-        better performance if there is no access to global memory.
-        */
-        item_ct1.barrier();
-        tmp = 0.f;
-        int nreduce = nwarps / WARP_SIZE;
-        for (size_t i = 0; i < nreduce; i += 1)
-        {
-            tmp += s_sum[lane_id + i * WARP_SIZE];
-        }
-        tmp = warp_reduce_sum(tmp, item_ct1);
-    }
-
-    float mean = tmp / group_size;
-    tmp = 0.0f;
-
-    for (int j = start; j < end; j += block_size) {
-        float xi = x[j] - mean;
-        dst[j] = xi;
-        tmp += xi * xi;
-    }
-
-    tmp = warp_reduce_sum(tmp, item_ct1);
-    if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        /*
-        DPCT1118:2: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        /*
-        DPCT1065:55: Consider replacing sycl::nd_item::barrier() with
-        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-        better performance if there is no access to global memory.
-        */
-        item_ct1.barrier();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp, item_ct1);
-    }
-
-    float variance = tmp / group_size;
-    float scale = sycl::rsqrt(variance + eps);
-    for (int j = start; j < end; j += block_size) {
-        dst[j] *= scale;
-    }
-}
-
-static void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps,
-                         const sycl::nd_item<3> &item_ct1, float *s_sum, int block_size) {
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-    const int tid = item_ct1.get_local_id(2);
-    const int nthreads = item_ct1.get_local_range(2);
-    const int nwarps = nthreads / WARP_SIZE;
-    assert(nwarps % WARP_SIZE == 0);
-    float tmp = 0.0f; // partial sum for thread in warp
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[row*ncols + col];
-        tmp += xi * xi;
-    }
-
-    // sum up partial sums
-    tmp = warp_reduce_sum(tmp, item_ct1);
-    if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        /*
-        DPCT1118:3: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        item_ct1.barrier(sycl::access::fence_space::local_space);
-        int nreduce = nwarps / WARP_SIZE;
-        tmp = 0.f;
-        for (size_t i = 0; i < nreduce; i += 1)
-        {
-            tmp += s_sum[lane_id + i * WARP_SIZE];
-        }
-        tmp = warp_reduce_sum(tmp, item_ct1);
-    }
-
-    const float mean = tmp / ncols;
-    const float scale = sycl::rsqrt(mean + eps);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        dst[row*ncols + col] = scale * x[row*ncols + col];
-    }
-}
-
 static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded,
                           const sycl::nd_item<3> &item_ct1) {
     const int ix = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
@@ -1665,92 +1441,6 @@ static void sqr_f32_sycl(const float *x, float *dst, const int k,
         });
 }
 
-static void norm_f32_sycl(const float *x, float *dst, const int ncols,
-                          const int nrows, const float eps,
-                          queue_ptr stream) {
-    GGML_ASSERT(ncols % WARP_SIZE == 0);
-    if (ncols < 1024) {
-        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-        stream->submit([&](sycl::handler& cgh) {
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
-                        norm_f32(x, dst, ncols, eps, item_ct1,
-                                 nullptr, WARP_SIZE);
-                    });
-            });
-    } else {
-        const int work_group_size = get_work_group_size(stream->get_device());
-        const sycl::range<3> block_dims(1, 1, work_group_size);
-        /*
-        DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->submit([&](sycl::handler& cgh) {
-            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
-                sycl::range<1>(work_group_size / WARP_SIZE), cgh);
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
-                        norm_f32(x, dst, ncols, eps, item_ct1,
-                                 s_sum_acc_ct1.get_pointer(), work_group_size);
-                    });
-            });
-    }
-}
-
-static void group_norm_f32_sycl(const float *x, float *dst,
-                                const int num_groups, const int group_size,
-                                const int ne_elements, queue_ptr stream) {
-    static const float eps = 1e-6f;
-    if (group_size < 1024) {
-        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-        stream->submit([&](sycl::handler &cgh) {
-            const float eps_ct4 = eps;
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
-                        group_norm_f32(
-                            x, dst, group_size, ne_elements, eps_ct4, item_ct1,
-                            nullptr, WARP_SIZE);
-                    });
-            });
-    } else {
-        const int work_group_size = get_work_group_size(stream->get_device());
-        const sycl::range<3> block_dims(1, 1, work_group_size);
-        /*
-        DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-
-        stream->submit([&](sycl::handler& cgh) {
-            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
-                                                         cgh);
-
-            const float eps_ct4 = eps;
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
-                        group_norm_f32(x, dst, group_size, ne_elements,
-                                       eps_ct4, item_ct1,
-                                       s_sum_acc_ct1.get_pointer(), work_group_size);
-                    });
-            });
-    }
-}
-
 static void concat_f32_sycl(const float *x, const float *y, float *dst,
                             const int ne0, int ne1, int ne2, int ne02,
                             queue_ptr stream) {
@@ -1792,46 +1482,6 @@ static void pad_f32_sycl(const float *x, float *dst, const int ne00,
         });
 }
 
-static void rms_norm_f32_sycl(const float *x, float *dst, const int ncols,
-                              const int nrows, const float eps,
-                              queue_ptr stream) {
-    GGML_ASSERT(ncols % WARP_SIZE == 0);
-    // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
-    if (ncols < 1024) {
-        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-        stream->submit([&](sycl::handler& cgh) {
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
-                        rms_norm_f32(x, dst, ncols, eps, item_ct1,
-                                     nullptr, WARP_SIZE);
-                    });
-            });
-    } else {
-        const int work_group_size = get_work_group_size(stream->get_device());
-        const sycl::range<3> block_dims(1, 1, work_group_size);
-        /*
-        DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->submit([&](sycl::handler& cgh) {
-            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
-                                                         cgh);
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
-                        rms_norm_f32(x, dst, ncols, eps, item_ct1,
-                                     s_sum_acc_ct1.get_pointer(), work_group_size);
-                    });
-            });
-    }
-}
-
 static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
                                    const int ky, const int kx_padded,
                                    queue_ptr stream) {
@@ -2411,12 +2061,6 @@ static inline int get_sycl_env(const char *env_name, int default_val) {
     return user_number;
 }
 
-static inline int get_work_group_size(const sycl::device& device) {
-    dpct::device_info prop;
-    dpct::get_device_info(prop, device);
-    return prop.get_max_work_group_size();
-}
-
 static void ggml_check_sycl() try {
     static bool initialized = false;
 
@@ -2975,45 +2619,6 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor
     (void) src1_dd;
 }
 
-inline void ggml_sycl_op_norm(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_group_norm(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                    const ggml_tensor *src1, ggml_tensor *dst,
-                                    const float *src0_dd, const float *src1_dd,
-                                    float *dst_dd,
-                                    const queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int num_groups = dst->op_params[0];
-    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
-    group_norm_f32_sycl(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
 inline void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                                 const ggml_tensor *src1, ggml_tensor *dst,
                                 const float *src0_dd, const float *src1_dd,
@@ -3077,28 +2682,6 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor
     (void) src1_dd;
 }
 
-inline void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                  const ggml_tensor *src1, ggml_tensor *dst,
-                                  const float *src0_dd, const float *src1_dd,
-                                  float *dst_dd,
-                                  const queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
 static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYCL_MAX_DEVICES> & tensor_split) {
     int64_t min_compute_capability = INT_MAX;
     int64_t max_compute_capability = INT_MIN;
diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp
index d5a63cd71..3afa33919 100644
--- a/ggml/src/ggml-sycl/backend.hpp
+++ b/ggml/src/ggml-sycl/backend.hpp
@@ -20,5 +20,6 @@
 #include "mmq.hpp"
 #include "mmvq.hpp"
 #include "rope.hpp"
+#include "norm.hpp"
 
 #endif // GGML_SYCL_BACKEND_HPP
diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index e01f91633..dfd4a7c2c 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -295,5 +295,60 @@ struct ggml_backend_sycl_context {
     }
 };
 
+// common host functions
+
+static inline int get_work_group_size(const sycl::device& device) {
+    dpct::device_info prop;
+    dpct::get_device_info(prop, device);
+    return prop.get_max_work_group_size();
+}
+
+
+// common device functions
+
+static __dpct_inline__ float warp_reduce_sum(float x,
+                                             const sycl::nd_item<3>& item_ct1) {
+#pragma unroll
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        /*
+        DPCT1096:98: The right-most dimension of the work-group used in the SYCL
+        kernel that calls this function may be less than "32". The function
+        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
+        CPU device. Modify the size of the work-group to ensure that the value
+        of the right-most dimension is a multiple of "32".
+        */
+        x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask);
+    }
+    return x;
+}
+
+static __dpct_inline__ sycl::float2
+warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3>& item_ct1) {
+#pragma unroll
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(),
+                                                mask);
+        a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(),
+                                                mask);
+    }
+    return a;
+}
+
+static __dpct_inline__ float warp_reduce_max(float x,
+                                             const sycl::nd_item<3>& item_ct1) {
+#pragma unroll
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        /*
+        DPCT1096:97: The right-most dimension of the work-group used in the SYCL
+        kernel that calls this function may be less than "32". The function
+        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
+        CPU device. Modify the size of the work-group to ensure that the value
+        of the right-most dimension is a multiple of "32".
+        */
+        x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
+                              item_ct1.get_sub_group(), x, mask));
+    }
+    return x;
+}
 #endif // GGML_SYCL_COMMON_HPP
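
For reference, the kernels moved into norm.cpp all follow the same two-level reduction: each work-item accumulates a strided partial sum, the partial is reduced within a sub-group via dpct::permute_sub_group_by_xor, one lane per sub-group stores its result in local memory (s_sum), and a final pass combines the per-sub-group values after a barrier. The sketch below is illustrative only and is not part of the diff above; it reproduces that pattern for a simple row mean using standard SYCL 2020 group algorithms instead of the dpct helpers, and the queue setup, sizes, and names are assumptions made for the example.

// Illustrative sketch, not part of the patch: two-level reduction for a row mean.
#include <sycl/sycl.hpp>
#include <cstdio>
#include <vector>

int main() {
    constexpr int ncols   = 4096;  // columns per row
    constexpr int nrows   = 4;     // one work-group per row
    constexpr int wg_size = 256;   // assumed work-group size (multiple of the sub-group size)

    sycl::queue q;
    std::vector<float> h_x(nrows * ncols, 1.0f);
    float *x    = sycl::malloc_device<float>(nrows * ncols, q);
    float *mean = sycl::malloc_device<float>(nrows, q);
    q.memcpy(x, h_x.data(), h_x.size() * sizeof(float)).wait();

    q.submit([&](sycl::handler &cgh) {
        // one slot per sub-group, the counterpart of s_sum in the kernels above
        sycl::local_accessor<float, 1> partial(sycl::range<1>(wg_size), cgh);
        cgh.parallel_for(
            sycl::nd_range<1>(sycl::range<1>(nrows * wg_size), sycl::range<1>(wg_size)),
            [=](sycl::nd_item<1> it) {
                const int row = it.get_group(0);
                const int tid = it.get_local_id(0);
                auto sg = it.get_sub_group();

                // each work-item accumulates a strided slice of the row
                float sum = 0.0f;
                for (int col = tid; col < ncols; col += wg_size) {
                    sum += x[row * ncols + col];
                }
                // level 1: reduce within the sub-group (the "warp" of the kernels above)
                sum = sycl::reduce_over_group(sg, sum, sycl::plus<float>());
                if (sg.get_local_linear_id() == 0) {
                    partial[sg.get_group_linear_id()] = sum;
                }
                it.barrier(sycl::access::fence_space::local_space);
                // level 2: the kernels above run a second warp reduction here; for
                // clarity this sketch lets work-item 0 add up the partials serially
                if (tid == 0) {
                    const int nsg = wg_size / sg.get_local_linear_range();
                    float total = 0.0f;
                    for (int i = 0; i < nsg; ++i) {
                        total += partial[i];
                    }
                    mean[row] = total / ncols;
                }
            });
    }).wait();

    std::vector<float> h_mean(nrows);
    q.memcpy(h_mean.data(), mean, nrows * sizeof(float)).wait();
    std::printf("mean of row 0 = %f\n", h_mean[0]);  // expected 1.0 for the all-ones input

    sycl::free(x, q);
    sycl::free(mean, q);
    return 0;
}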