move norm to norm.cpp file
This commit is contained in:
parent
90e0328038
commit
a2936f40cd
5 changed files with 461 additions and 417 deletions
370
ggml-sycl/norm.cpp
Normal file
370
ggml-sycl/norm.cpp
Normal file
|
@ -0,0 +1,370 @@
|
||||||
|
#include "norm.hpp"
|
||||||
|
|
||||||
|
static void norm_f32(const float* x, float* dst, const int ncols, const float eps,
|
||||||
|
const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
|
||||||
|
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
||||||
|
item_ct1.get_local_id(1);
|
||||||
|
const int tid = item_ct1.get_local_id(2);
|
||||||
|
|
||||||
|
const int nthreads = item_ct1.get_local_range(2);
|
||||||
|
const int nwarps = nthreads / WARP_SIZE;
|
||||||
|
assert(nwarps % WARP_SIZE == 0);
|
||||||
|
sycl::float2 mean_var = sycl::float2(0.f, 0.f);
|
||||||
|
|
||||||
|
for (int col = tid; col < ncols; col += block_size) {
|
||||||
|
const float xi = x[row * ncols + col];
|
||||||
|
mean_var.x() += xi;
|
||||||
|
mean_var.y() += xi * xi;
|
||||||
|
}
|
||||||
|
|
||||||
|
// sum up partial sums
|
||||||
|
mean_var = warp_reduce_sum(mean_var, item_ct1);
|
||||||
|
if (block_size > WARP_SIZE) {
|
||||||
|
|
||||||
|
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
||||||
|
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
||||||
|
if (lane_id == 0) {
|
||||||
|
s_sum[warp_id] = mean_var;
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
DPCT1118:0: SYCL group functions and algorithms must be encountered in
|
||||||
|
converged control flow. You may need to adjust the code.
|
||||||
|
*/
|
||||||
|
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||||
|
mean_var = 0.f;
|
||||||
|
int nreduce = nwarps / WARP_SIZE;
|
||||||
|
for (size_t i = 0; i < nreduce; i += 1)
|
||||||
|
{
|
||||||
|
mean_var += s_sum[lane_id + i * WARP_SIZE];
|
||||||
|
}
|
||||||
|
mean_var = warp_reduce_sum(mean_var, item_ct1);
|
||||||
|
}
|
||||||
|
|
||||||
|
const float mean = mean_var.x() / ncols;
|
||||||
|
const float var = mean_var.y() / ncols - mean * mean;
|
||||||
|
const float inv_std = sycl::rsqrt(var + eps);
|
||||||
|
|
||||||
|
for (int col = tid; col < ncols; col += block_size) {
|
||||||
|
dst[row * ncols + col] = (x[row * ncols + col] - mean) * inv_std;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void group_norm_f32(const float* x, float* dst, const int group_size, const int ne_elements, const float eps,
|
||||||
|
const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
|
||||||
|
int start = item_ct1.get_group(2) * group_size;
|
||||||
|
int end = start + group_size;
|
||||||
|
const int nthreads = item_ct1.get_local_range(2);
|
||||||
|
const int nwarps = nthreads / WARP_SIZE;
|
||||||
|
assert(nwarps % WARP_SIZE == 0);
|
||||||
|
start += item_ct1.get_local_id(2);
|
||||||
|
|
||||||
|
if (end >= ne_elements) {
|
||||||
|
end = ne_elements;
|
||||||
|
}
|
||||||
|
|
||||||
|
float tmp = 0.0f; // partial sum for thread in warp
|
||||||
|
|
||||||
|
for (int j = start; j < end; j += block_size) {
|
||||||
|
tmp += x[j];
|
||||||
|
}
|
||||||
|
|
||||||
|
tmp = warp_reduce_sum(tmp, item_ct1);
|
||||||
|
if (block_size > WARP_SIZE) {
|
||||||
|
|
||||||
|
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
||||||
|
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
||||||
|
if (lane_id == 0) {
|
||||||
|
s_sum[warp_id] = tmp;
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
DPCT1118:1: SYCL group functions and algorithms must be encountered in
|
||||||
|
converged control flow. You may need to adjust the code.
|
||||||
|
*/
|
||||||
|
/*
|
||||||
|
DPCT1065:54: Consider replacing sycl::nd_item::barrier() with
|
||||||
|
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
||||||
|
better performance if there is no access to global memory.
|
||||||
|
*/
|
||||||
|
item_ct1.barrier();
|
||||||
|
tmp = 0.f;
|
||||||
|
int nreduce = nwarps / WARP_SIZE;
|
||||||
|
for (size_t i = 0; i < nreduce; i += 1)
|
||||||
|
{
|
||||||
|
tmp += s_sum[lane_id + i * WARP_SIZE];
|
||||||
|
}
|
||||||
|
tmp = warp_reduce_sum(tmp, item_ct1);
|
||||||
|
}
|
||||||
|
|
||||||
|
float mean = tmp / group_size;
|
||||||
|
tmp = 0.0f;
|
||||||
|
|
||||||
|
for (int j = start; j < end; j += block_size) {
|
||||||
|
float xi = x[j] - mean;
|
||||||
|
dst[j] = xi;
|
||||||
|
tmp += xi * xi;
|
||||||
|
}
|
||||||
|
|
||||||
|
tmp = warp_reduce_sum(tmp, item_ct1);
|
||||||
|
if (block_size > WARP_SIZE) {
|
||||||
|
|
||||||
|
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
||||||
|
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
||||||
|
if (lane_id == 0) {
|
||||||
|
s_sum[warp_id] = tmp;
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
DPCT1118:2: SYCL group functions and algorithms must be encountered in
|
||||||
|
converged control flow. You may need to adjust the code.
|
||||||
|
*/
|
||||||
|
/*
|
||||||
|
DPCT1065:55: Consider replacing sycl::nd_item::barrier() with
|
||||||
|
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
||||||
|
better performance if there is no access to global memory.
|
||||||
|
*/
|
||||||
|
item_ct1.barrier();
|
||||||
|
tmp = s_sum[lane_id];
|
||||||
|
tmp = warp_reduce_sum(tmp, item_ct1);
|
||||||
|
}
|
||||||
|
|
||||||
|
float variance = tmp / group_size;
|
||||||
|
float scale = sycl::rsqrt(variance + eps);
|
||||||
|
for (int j = start; j < end; j += block_size) {
|
||||||
|
dst[j] *= scale;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void rms_norm_f32(const float* x, float* dst, const int ncols, const float eps,
|
||||||
|
const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
|
||||||
|
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
||||||
|
item_ct1.get_local_id(1);
|
||||||
|
const int tid = item_ct1.get_local_id(2);
|
||||||
|
const int nthreads = item_ct1.get_local_range(2);
|
||||||
|
const int nwarps = nthreads / WARP_SIZE;
|
||||||
|
assert(nwarps % WARP_SIZE == 0);
|
||||||
|
float tmp = 0.0f; // partial sum for thread in warp
|
||||||
|
|
||||||
|
for (int col = tid; col < ncols; col += block_size) {
|
||||||
|
const float xi = x[row * ncols + col];
|
||||||
|
tmp += xi * xi;
|
||||||
|
}
|
||||||
|
|
||||||
|
// sum up partial sums
|
||||||
|
tmp = warp_reduce_sum(tmp, item_ct1);
|
||||||
|
if (block_size > WARP_SIZE) {
|
||||||
|
|
||||||
|
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
||||||
|
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
||||||
|
if (lane_id == 0) {
|
||||||
|
s_sum[warp_id] = tmp;
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
DPCT1118:3: SYCL group functions and algorithms must be encountered in
|
||||||
|
converged control flow. You may need to adjust the code.
|
||||||
|
*/
|
||||||
|
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||||
|
int nreduce = nwarps / WARP_SIZE;
|
||||||
|
tmp = 0.f;
|
||||||
|
for (size_t i = 0; i < nreduce; i += 1)
|
||||||
|
{
|
||||||
|
tmp += s_sum[lane_id + i * WARP_SIZE];
|
||||||
|
}
|
||||||
|
tmp = warp_reduce_sum(tmp, item_ct1);
|
||||||
|
}
|
||||||
|
|
||||||
|
const float mean = tmp / ncols;
|
||||||
|
const float scale = sycl::rsqrt(mean + eps);
|
||||||
|
|
||||||
|
for (int col = tid; col < ncols; col += block_size) {
|
||||||
|
dst[row * ncols + col] = scale * x[row * ncols + col];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void norm_f32_sycl(const float* x, float* dst, const int ncols,
|
||||||
|
const int nrows, const float eps,
|
||||||
|
queue_ptr stream) {
|
||||||
|
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
||||||
|
if (ncols < 1024) {
|
||||||
|
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
||||||
|
stream->submit([&](sycl::handler& cgh) {
|
||||||
|
cgh.parallel_for(
|
||||||
|
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
||||||
|
block_dims),
|
||||||
|
[=](sycl::nd_item<3> item_ct1)
|
||||||
|
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||||
|
norm_f32(x, dst, ncols, eps, item_ct1,
|
||||||
|
nullptr, WARP_SIZE);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
const int work_group_size = get_work_group_size(stream->get_device());
|
||||||
|
const sycl::range<3> block_dims(1, 1, work_group_size);
|
||||||
|
/*
|
||||||
|
DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
|
||||||
|
the limit. To get the device limit, query
|
||||||
|
info::device::max_work_group_size. Adjust the work-group size if needed.
|
||||||
|
*/
|
||||||
|
stream->submit([&](sycl::handler& cgh) {
|
||||||
|
sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
|
||||||
|
sycl::range<1>(work_group_size / WARP_SIZE), cgh);
|
||||||
|
|
||||||
|
cgh.parallel_for(
|
||||||
|
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
||||||
|
block_dims),
|
||||||
|
[=](sycl::nd_item<3> item_ct1)
|
||||||
|
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||||
|
norm_f32(x, dst, ncols, eps, item_ct1,
|
||||||
|
s_sum_acc_ct1.get_pointer(), work_group_size);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void group_norm_f32_sycl(const float* x, float* dst,
|
||||||
|
const int num_groups, const int group_size,
|
||||||
|
const int ne_elements, queue_ptr stream) {
|
||||||
|
static const float eps = 1e-6f;
|
||||||
|
if (group_size < 1024) {
|
||||||
|
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
||||||
|
stream->submit([&](sycl::handler& cgh) {
|
||||||
|
const float eps_ct4 = eps;
|
||||||
|
cgh.parallel_for(
|
||||||
|
sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
|
||||||
|
block_dims),
|
||||||
|
[=](sycl::nd_item<3> item_ct1)
|
||||||
|
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||||
|
group_norm_f32(
|
||||||
|
x, dst, group_size, ne_elements, eps_ct4, item_ct1,
|
||||||
|
nullptr, WARP_SIZE);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
const int work_group_size = get_work_group_size(stream->get_device());
|
||||||
|
const sycl::range<3> block_dims(1, 1, work_group_size);
|
||||||
|
/*
|
||||||
|
DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
|
||||||
|
the limit. To get the device limit, query
|
||||||
|
info::device::max_work_group_size. Adjust the work-group size if needed.
|
||||||
|
*/
|
||||||
|
|
||||||
|
stream->submit([&](sycl::handler& cgh) {
|
||||||
|
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
|
||||||
|
cgh);
|
||||||
|
|
||||||
|
const float eps_ct4 = eps;
|
||||||
|
|
||||||
|
cgh.parallel_for(
|
||||||
|
sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
|
||||||
|
block_dims),
|
||||||
|
[=](sycl::nd_item<3> item_ct1)
|
||||||
|
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||||
|
group_norm_f32(x, dst, group_size, ne_elements,
|
||||||
|
eps_ct4, item_ct1,
|
||||||
|
s_sum_acc_ct1.get_pointer(), work_group_size);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
|
||||||
|
const int nrows, const float eps,
|
||||||
|
queue_ptr stream) {
|
||||||
|
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
||||||
|
// printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
|
||||||
|
if (ncols < 1024) {
|
||||||
|
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
||||||
|
stream->submit([&](sycl::handler& cgh) {
|
||||||
|
cgh.parallel_for(
|
||||||
|
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
||||||
|
block_dims),
|
||||||
|
[=](sycl::nd_item<3> item_ct1)
|
||||||
|
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||||
|
rms_norm_f32(x, dst, ncols, eps, item_ct1,
|
||||||
|
nullptr, WARP_SIZE);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
const int work_group_size = get_work_group_size(stream->get_device());
|
||||||
|
const sycl::range<3> block_dims(1, 1, work_group_size);
|
||||||
|
/*
|
||||||
|
DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
|
||||||
|
the limit. To get the device limit, query
|
||||||
|
info::device::max_work_group_size. Adjust the work-group size if needed.
|
||||||
|
*/
|
||||||
|
stream->submit([&](sycl::handler& cgh) {
|
||||||
|
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
|
||||||
|
cgh);
|
||||||
|
cgh.parallel_for(
|
||||||
|
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
||||||
|
block_dims),
|
||||||
|
[=](sycl::nd_item<3> item_ct1)
|
||||||
|
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||||
|
rms_norm_f32(x, dst, ncols, eps, item_ct1,
|
||||||
|
s_sum_acc_ct1.get_pointer(), work_group_size);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, const ggml_tensor* src1,
|
||||||
|
ggml_tensor* dst, const float* src0_dd,
|
||||||
|
const float* src1_dd, float* dst_dd,
|
||||||
|
const queue_ptr& main_stream) {
|
||||||
|
|
||||||
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||||
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||||
|
|
||||||
|
const int64_t ne00 = src0->ne[0];
|
||||||
|
const int64_t nrows = ggml_nrows(src0);
|
||||||
|
|
||||||
|
float eps;
|
||||||
|
memcpy(&eps, dst->op_params, sizeof(float));
|
||||||
|
|
||||||
|
norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
|
||||||
|
|
||||||
|
(void)src1;
|
||||||
|
(void)dst;
|
||||||
|
(void)src1_dd;
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
|
||||||
|
const ggml_tensor* src1, ggml_tensor* dst,
|
||||||
|
const float* src0_dd, const float* src1_dd,
|
||||||
|
float* dst_dd,
|
||||||
|
const queue_ptr& main_stream) {
|
||||||
|
|
||||||
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||||
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||||
|
|
||||||
|
int num_groups = dst->op_params[0];
|
||||||
|
int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
|
||||||
|
group_norm_f32_sycl(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
|
||||||
|
|
||||||
|
(void)src1;
|
||||||
|
(void)dst;
|
||||||
|
(void)src1_dd;
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
|
||||||
|
const ggml_tensor* src1, ggml_tensor* dst,
|
||||||
|
const float* src0_dd, const float* src1_dd,
|
||||||
|
float* dst_dd,
|
||||||
|
const queue_ptr& main_stream) {
|
||||||
|
|
||||||
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||||
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||||
|
|
||||||
|
const int64_t ne00 = src0->ne[0];
|
||||||
|
const int64_t nrows = ggml_nrows(src0);
|
||||||
|
|
||||||
|
float eps;
|
||||||
|
memcpy(&eps, dst->op_params, sizeof(float));
|
||||||
|
|
||||||
|
rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
|
||||||
|
|
||||||
|
(void)src1;
|
||||||
|
(void)dst;
|
||||||
|
(void)src1_dd;
|
||||||
|
}
|
35
ggml-sycl/norm.hpp
Normal file
35
ggml-sycl/norm.hpp
Normal file
|
@ -0,0 +1,35 @@
|
||||||
|
//
|
||||||
|
// MIT license
|
||||||
|
// Copyright (C) 2024 Intel Corporation
|
||||||
|
// SPDX-License-Identifier: MIT
|
||||||
|
//
|
||||||
|
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
|
||||||
|
#ifndef GGML_SYCL_NORM_HPP
|
||||||
|
#define GGML_SYCL_NORM_HPP
|
||||||
|
|
||||||
|
#include "common.hpp"
|
||||||
|
|
||||||
|
void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, const ggml_tensor* src1,
|
||||||
|
ggml_tensor* dst, const float* src0_dd,
|
||||||
|
const float* src1_dd, float* dst_dd,
|
||||||
|
const queue_ptr& main_stream);
|
||||||
|
|
||||||
|
void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
|
||||||
|
const ggml_tensor* src1, ggml_tensor* dst,
|
||||||
|
const float* src0_dd, const float* src1_dd,
|
||||||
|
float* dst_dd,
|
||||||
|
const queue_ptr& main_stream);
|
||||||
|
|
||||||
|
void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
|
||||||
|
const ggml_tensor* src1, ggml_tensor* dst,
|
||||||
|
const float* src0_dd, const float* src1_dd,
|
||||||
|
float* dst_dd,
|
||||||
|
const queue_ptr& main_stream);
|
||||||
|
|
||||||
|
#endif // GGML_SYCL_NORM_HPP
|
|
@ -74,51 +74,6 @@ typedef void (*ggml_sycl_op_flatten_t)(ggml_backend_sycl_context & ctx, const gg
|
||||||
const float *src1_dd, float *dst_dd,
|
const float *src1_dd, float *dst_dd,
|
||||||
const queue_ptr &main_stream);
|
const queue_ptr &main_stream);
|
||||||
|
|
||||||
static __dpct_inline__ float warp_reduce_sum(float x,
|
|
||||||
const sycl::nd_item<3> &item_ct1) {
|
|
||||||
#pragma unroll
|
|
||||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
|
||||||
/*
|
|
||||||
DPCT1096:98: The right-most dimension of the work-group used in the SYCL
|
|
||||||
kernel that calls this function may be less than "32". The function
|
|
||||||
"dpct::permute_sub_group_by_xor" may return an unexpected result on the
|
|
||||||
CPU device. Modify the size of the work-group to ensure that the value
|
|
||||||
of the right-most dimension is a multiple of "32".
|
|
||||||
*/
|
|
||||||
x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask);
|
|
||||||
}
|
|
||||||
return x;
|
|
||||||
}
|
|
||||||
|
|
||||||
static __dpct_inline__ sycl::float2
|
|
||||||
warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3> &item_ct1) {
|
|
||||||
#pragma unroll
|
|
||||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
|
||||||
a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(),
|
|
||||||
mask);
|
|
||||||
a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(),
|
|
||||||
mask);
|
|
||||||
}
|
|
||||||
return a;
|
|
||||||
}
|
|
||||||
|
|
||||||
static __dpct_inline__ float warp_reduce_max(float x,
|
|
||||||
const sycl::nd_item<3> &item_ct1) {
|
|
||||||
#pragma unroll
|
|
||||||
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
|
||||||
/*
|
|
||||||
DPCT1096:97: The right-most dimension of the work-group used in the SYCL
|
|
||||||
kernel that calls this function may be less than "32". The function
|
|
||||||
"dpct::permute_sub_group_by_xor" may return an unexpected result on the
|
|
||||||
CPU device. Modify the size of the work-group to ensure that the value
|
|
||||||
of the right-most dimension is a multiple of "32".
|
|
||||||
*/
|
|
||||||
x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
|
|
||||||
item_ct1.get_sub_group(), x, mask));
|
|
||||||
}
|
|
||||||
return x;
|
|
||||||
}
|
|
||||||
|
|
||||||
static __dpct_inline__ float op_repeat(const float a, const float b) {
|
static __dpct_inline__ float op_repeat(const float a, const float b) {
|
||||||
return b;
|
return b;
|
||||||
GGML_UNUSED(a);
|
GGML_UNUSED(a);
|
||||||
|
@ -336,55 +291,6 @@ static void sqr_f32(const float * x, float * dst, const int k,
|
||||||
dst[i] = x[i] * x[i];
|
dst[i] = x[i] * x[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
static void norm_f32(const float * x, float * dst, const int ncols, const float eps,
|
|
||||||
const sycl::nd_item<3> &item_ct1, sycl::float2 *s_sum, int block_size) {
|
|
||||||
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
|
||||||
item_ct1.get_local_id(1);
|
|
||||||
const int tid = item_ct1.get_local_id(2);
|
|
||||||
|
|
||||||
const int nthreads = item_ct1.get_local_range(2);
|
|
||||||
const int nwarps = nthreads / WARP_SIZE;
|
|
||||||
assert(nwarps % WARP_SIZE == 0);
|
|
||||||
sycl::float2 mean_var = sycl::float2(0.f, 0.f);
|
|
||||||
|
|
||||||
for (int col = tid; col < ncols; col += block_size) {
|
|
||||||
const float xi = x[row*ncols + col];
|
|
||||||
mean_var.x() += xi;
|
|
||||||
mean_var.y() += xi * xi;
|
|
||||||
}
|
|
||||||
|
|
||||||
// sum up partial sums
|
|
||||||
mean_var = warp_reduce_sum(mean_var, item_ct1);
|
|
||||||
if (block_size > WARP_SIZE) {
|
|
||||||
|
|
||||||
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
|
||||||
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
|
||||||
if (lane_id == 0) {
|
|
||||||
s_sum[warp_id] = mean_var;
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
DPCT1118:0: SYCL group functions and algorithms must be encountered in
|
|
||||||
converged control flow. You may need to adjust the code.
|
|
||||||
*/
|
|
||||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
|
||||||
mean_var = 0.f;
|
|
||||||
int nreduce = nwarps / WARP_SIZE;
|
|
||||||
for (size_t i = 0; i < nreduce; i+= 1)
|
|
||||||
{
|
|
||||||
mean_var += s_sum[lane_id + i * WARP_SIZE];
|
|
||||||
}
|
|
||||||
mean_var = warp_reduce_sum(mean_var, item_ct1);
|
|
||||||
}
|
|
||||||
|
|
||||||
const float mean = mean_var.x() / ncols;
|
|
||||||
const float var = mean_var.y() / ncols - mean * mean;
|
|
||||||
const float inv_std = sycl::rsqrt(var + eps);
|
|
||||||
|
|
||||||
for (int col = tid; col < ncols; col += block_size) {
|
|
||||||
dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02,
|
static void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02,
|
||||||
const sycl::nd_item<3> &item_ct1) {
|
const sycl::nd_item<3> &item_ct1) {
|
||||||
int nidx = item_ct1.get_local_id(2) +
|
int nidx = item_ct1.get_local_id(2) +
|
||||||
|
@ -452,136 +358,6 @@ static void pad_f32(const float *x, float *dst, const int ne0, const int ne00,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps,
|
|
||||||
const sycl::nd_item<3> &item_ct1, float *s_sum, int block_size) {
|
|
||||||
int start = item_ct1.get_group(2) * group_size;
|
|
||||||
int end = start + group_size;
|
|
||||||
const int nthreads = item_ct1.get_local_range(2);
|
|
||||||
const int nwarps = nthreads / WARP_SIZE;
|
|
||||||
assert(nwarps % WARP_SIZE == 0);
|
|
||||||
start += item_ct1.get_local_id(2);
|
|
||||||
|
|
||||||
if (end >= ne_elements) {
|
|
||||||
end = ne_elements;
|
|
||||||
}
|
|
||||||
|
|
||||||
float tmp = 0.0f; // partial sum for thread in warp
|
|
||||||
|
|
||||||
for (int j = start; j < end; j += block_size) {
|
|
||||||
tmp += x[j];
|
|
||||||
}
|
|
||||||
|
|
||||||
tmp = warp_reduce_sum(tmp, item_ct1);
|
|
||||||
if (block_size > WARP_SIZE) {
|
|
||||||
|
|
||||||
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
|
||||||
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
|
||||||
if (lane_id == 0) {
|
|
||||||
s_sum[warp_id] = tmp;
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
DPCT1118:1: SYCL group functions and algorithms must be encountered in
|
|
||||||
converged control flow. You may need to adjust the code.
|
|
||||||
*/
|
|
||||||
/*
|
|
||||||
DPCT1065:54: Consider replacing sycl::nd_item::barrier() with
|
|
||||||
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
|
||||||
better performance if there is no access to global memory.
|
|
||||||
*/
|
|
||||||
item_ct1.barrier();
|
|
||||||
tmp = 0.f;
|
|
||||||
int nreduce = nwarps / WARP_SIZE;
|
|
||||||
for (size_t i = 0; i < nreduce; i += 1)
|
|
||||||
{
|
|
||||||
tmp += s_sum[lane_id + i * WARP_SIZE];
|
|
||||||
}
|
|
||||||
tmp = warp_reduce_sum(tmp, item_ct1);
|
|
||||||
}
|
|
||||||
|
|
||||||
float mean = tmp / group_size;
|
|
||||||
tmp = 0.0f;
|
|
||||||
|
|
||||||
for (int j = start; j < end; j += block_size) {
|
|
||||||
float xi = x[j] - mean;
|
|
||||||
dst[j] = xi;
|
|
||||||
tmp += xi * xi;
|
|
||||||
}
|
|
||||||
|
|
||||||
tmp = warp_reduce_sum(tmp, item_ct1);
|
|
||||||
if (block_size > WARP_SIZE) {
|
|
||||||
|
|
||||||
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
|
||||||
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
|
||||||
if (lane_id == 0) {
|
|
||||||
s_sum[warp_id] = tmp;
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
DPCT1118:2: SYCL group functions and algorithms must be encountered in
|
|
||||||
converged control flow. You may need to adjust the code.
|
|
||||||
*/
|
|
||||||
/*
|
|
||||||
DPCT1065:55: Consider replacing sycl::nd_item::barrier() with
|
|
||||||
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
|
|
||||||
better performance if there is no access to global memory.
|
|
||||||
*/
|
|
||||||
item_ct1.barrier();
|
|
||||||
tmp = s_sum[lane_id];
|
|
||||||
tmp = warp_reduce_sum(tmp, item_ct1);
|
|
||||||
}
|
|
||||||
|
|
||||||
float variance = tmp / group_size;
|
|
||||||
float scale = sycl::rsqrt(variance + eps);
|
|
||||||
for (int j = start; j < end; j += block_size) {
|
|
||||||
dst[j] *= scale;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps,
|
|
||||||
const sycl::nd_item<3> &item_ct1, float *s_sum, int block_size) {
|
|
||||||
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
|
||||||
item_ct1.get_local_id(1);
|
|
||||||
const int tid = item_ct1.get_local_id(2);
|
|
||||||
const int nthreads = item_ct1.get_local_range(2);
|
|
||||||
const int nwarps = nthreads / WARP_SIZE;
|
|
||||||
assert(nwarps % WARP_SIZE == 0);
|
|
||||||
float tmp = 0.0f; // partial sum for thread in warp
|
|
||||||
|
|
||||||
for (int col = tid; col < ncols; col += block_size) {
|
|
||||||
const float xi = x[row*ncols + col];
|
|
||||||
tmp += xi * xi;
|
|
||||||
}
|
|
||||||
|
|
||||||
// sum up partial sums
|
|
||||||
tmp = warp_reduce_sum(tmp, item_ct1);
|
|
||||||
if (block_size > WARP_SIZE) {
|
|
||||||
|
|
||||||
int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
|
|
||||||
int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
|
|
||||||
if (lane_id == 0) {
|
|
||||||
s_sum[warp_id] = tmp;
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
DPCT1118:3: SYCL group functions and algorithms must be encountered in
|
|
||||||
converged control flow. You may need to adjust the code.
|
|
||||||
*/
|
|
||||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
|
||||||
int nreduce = nwarps / WARP_SIZE;
|
|
||||||
tmp = 0.f;
|
|
||||||
for (size_t i = 0; i < nreduce; i += 1)
|
|
||||||
{
|
|
||||||
tmp += s_sum[lane_id + i * WARP_SIZE];
|
|
||||||
}
|
|
||||||
tmp = warp_reduce_sum(tmp, item_ct1);
|
|
||||||
}
|
|
||||||
|
|
||||||
const float mean = tmp / ncols;
|
|
||||||
const float scale = sycl::rsqrt(mean + eps);
|
|
||||||
|
|
||||||
for (int col = tid; col < ncols; col += block_size) {
|
|
||||||
dst[row*ncols + col] = scale * x[row*ncols + col];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded,
|
static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded,
|
||||||
const sycl::nd_item<3> &item_ct1) {
|
const sycl::nd_item<3> &item_ct1) {
|
||||||
const int ix = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
const int ix = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||||
|
@ -1665,92 +1441,6 @@ static void sqr_f32_sycl(const float *x, float *dst, const int k,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
static void norm_f32_sycl(const float *x, float *dst, const int ncols,
|
|
||||||
const int nrows, const float eps,
|
|
||||||
queue_ptr stream) {
|
|
||||||
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
|
||||||
if (ncols < 1024) {
|
|
||||||
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
|
||||||
stream->submit([&](sycl::handler& cgh) {
|
|
||||||
cgh.parallel_for(
|
|
||||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
|
||||||
block_dims),
|
|
||||||
[=](sycl::nd_item<3> item_ct1)
|
|
||||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
|
||||||
norm_f32(x, dst, ncols, eps, item_ct1,
|
|
||||||
nullptr, WARP_SIZE);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
const int work_group_size = get_work_group_size(stream->get_device());
|
|
||||||
const sycl::range<3> block_dims(1, 1, work_group_size);
|
|
||||||
/*
|
|
||||||
DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
|
|
||||||
the limit. To get the device limit, query
|
|
||||||
info::device::max_work_group_size. Adjust the work-group size if needed.
|
|
||||||
*/
|
|
||||||
stream->submit([&](sycl::handler& cgh) {
|
|
||||||
sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
|
|
||||||
sycl::range<1>(work_group_size / WARP_SIZE), cgh);
|
|
||||||
|
|
||||||
cgh.parallel_for(
|
|
||||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
|
||||||
block_dims),
|
|
||||||
[=](sycl::nd_item<3> item_ct1)
|
|
||||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
|
||||||
norm_f32(x, dst, ncols, eps, item_ct1,
|
|
||||||
s_sum_acc_ct1.get_pointer(), work_group_size);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void group_norm_f32_sycl(const float *x, float *dst,
|
|
||||||
const int num_groups, const int group_size,
|
|
||||||
const int ne_elements, queue_ptr stream) {
|
|
||||||
static const float eps = 1e-6f;
|
|
||||||
if (group_size < 1024) {
|
|
||||||
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
|
||||||
stream->submit([&](sycl::handler &cgh) {
|
|
||||||
const float eps_ct4 = eps;
|
|
||||||
cgh.parallel_for(
|
|
||||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
|
|
||||||
block_dims),
|
|
||||||
[=](sycl::nd_item<3> item_ct1)
|
|
||||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
|
||||||
group_norm_f32(
|
|
||||||
x, dst, group_size, ne_elements, eps_ct4, item_ct1,
|
|
||||||
nullptr, WARP_SIZE);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
const int work_group_size = get_work_group_size(stream->get_device());
|
|
||||||
const sycl::range<3> block_dims(1, 1, work_group_size);
|
|
||||||
/*
|
|
||||||
DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
|
|
||||||
the limit. To get the device limit, query
|
|
||||||
info::device::max_work_group_size. Adjust the work-group size if needed.
|
|
||||||
*/
|
|
||||||
|
|
||||||
stream->submit([&](sycl::handler& cgh) {
|
|
||||||
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
|
|
||||||
cgh);
|
|
||||||
|
|
||||||
const float eps_ct4 = eps;
|
|
||||||
|
|
||||||
cgh.parallel_for(
|
|
||||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
|
|
||||||
block_dims),
|
|
||||||
[=](sycl::nd_item<3> item_ct1)
|
|
||||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
|
||||||
group_norm_f32(x, dst, group_size, ne_elements,
|
|
||||||
eps_ct4, item_ct1,
|
|
||||||
s_sum_acc_ct1.get_pointer(), work_group_size);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void concat_f32_sycl(const float *x, const float *y, float *dst,
|
static void concat_f32_sycl(const float *x, const float *y, float *dst,
|
||||||
const int ne0, int ne1, int ne2, int ne02,
|
const int ne0, int ne1, int ne2, int ne02,
|
||||||
queue_ptr stream) {
|
queue_ptr stream) {
|
||||||
|
@ -1792,46 +1482,6 @@ static void pad_f32_sycl(const float *x, float *dst, const int ne00,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
static void rms_norm_f32_sycl(const float *x, float *dst, const int ncols,
|
|
||||||
const int nrows, const float eps,
|
|
||||||
queue_ptr stream) {
|
|
||||||
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
|
||||||
// printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
|
|
||||||
if (ncols < 1024) {
|
|
||||||
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
|
|
||||||
stream->submit([&](sycl::handler& cgh) {
|
|
||||||
cgh.parallel_for(
|
|
||||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
|
||||||
block_dims),
|
|
||||||
[=](sycl::nd_item<3> item_ct1)
|
|
||||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
|
||||||
rms_norm_f32(x, dst, ncols, eps, item_ct1,
|
|
||||||
nullptr, WARP_SIZE);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
const int work_group_size = get_work_group_size(stream->get_device());
|
|
||||||
const sycl::range<3> block_dims(1, 1, work_group_size);
|
|
||||||
/*
|
|
||||||
DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
|
|
||||||
the limit. To get the device limit, query
|
|
||||||
info::device::max_work_group_size. Adjust the work-group size if needed.
|
|
||||||
*/
|
|
||||||
stream->submit([&](sycl::handler& cgh) {
|
|
||||||
sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
|
|
||||||
cgh);
|
|
||||||
cgh.parallel_for(
|
|
||||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
|
||||||
block_dims),
|
|
||||||
[=](sycl::nd_item<3> item_ct1)
|
|
||||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
|
||||||
rms_norm_f32(x, dst, ncols, eps, item_ct1,
|
|
||||||
s_sum_acc_ct1.get_pointer(), work_group_size);
|
|
||||||
});
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
|
static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
|
||||||
const int ky, const int kx_padded,
|
const int ky, const int kx_padded,
|
||||||
queue_ptr stream) {
|
queue_ptr stream) {
|
||||||
|
@ -2411,12 +2061,6 @@ static inline int get_sycl_env(const char *env_name, int default_val) {
|
||||||
return user_number;
|
return user_number;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline int get_work_group_size(const sycl::device& device) {
|
|
||||||
dpct::device_info prop;
|
|
||||||
dpct::get_device_info(prop, device);
|
|
||||||
return prop.get_max_work_group_size();
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_check_sycl() try {
|
static void ggml_check_sycl() try {
|
||||||
static bool initialized = false;
|
static bool initialized = false;
|
||||||
|
|
||||||
|
@ -2975,45 +2619,6 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor
|
||||||
(void) src1_dd;
|
(void) src1_dd;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void ggml_sycl_op_norm(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
|
|
||||||
ggml_tensor *dst, const float *src0_dd,
|
|
||||||
const float *src1_dd, float *dst_dd,
|
|
||||||
const queue_ptr &main_stream) {
|
|
||||||
|
|
||||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
||||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
||||||
|
|
||||||
const int64_t ne00 = src0->ne[0];
|
|
||||||
const int64_t nrows = ggml_nrows(src0);
|
|
||||||
|
|
||||||
float eps;
|
|
||||||
memcpy(&eps, dst->op_params, sizeof(float));
|
|
||||||
|
|
||||||
norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
|
|
||||||
|
|
||||||
(void) src1;
|
|
||||||
(void) dst;
|
|
||||||
(void) src1_dd;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void ggml_sycl_op_group_norm(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
|
||||||
const ggml_tensor *src1, ggml_tensor *dst,
|
|
||||||
const float *src0_dd, const float *src1_dd,
|
|
||||||
float *dst_dd,
|
|
||||||
const queue_ptr &main_stream) {
|
|
||||||
|
|
||||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
||||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
||||||
|
|
||||||
int num_groups = dst->op_params[0];
|
|
||||||
int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
|
|
||||||
group_norm_f32_sycl(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
|
|
||||||
|
|
||||||
(void) src1;
|
|
||||||
(void) dst;
|
|
||||||
(void) src1_dd;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
inline void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
||||||
const ggml_tensor *src1, ggml_tensor *dst,
|
const ggml_tensor *src1, ggml_tensor *dst,
|
||||||
const float *src0_dd, const float *src1_dd,
|
const float *src0_dd, const float *src1_dd,
|
||||||
|
@ -3077,28 +2682,6 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor
|
||||||
(void) src1_dd;
|
(void) src1_dd;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
|
||||||
const ggml_tensor *src1, ggml_tensor *dst,
|
|
||||||
const float *src0_dd, const float *src1_dd,
|
|
||||||
float *dst_dd,
|
|
||||||
const queue_ptr &main_stream) {
|
|
||||||
|
|
||||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
||||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
||||||
|
|
||||||
const int64_t ne00 = src0->ne[0];
|
|
||||||
const int64_t nrows = ggml_nrows(src0);
|
|
||||||
|
|
||||||
float eps;
|
|
||||||
memcpy(&eps, dst->op_params, sizeof(float));
|
|
||||||
|
|
||||||
rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
|
|
||||||
|
|
||||||
(void) src1;
|
|
||||||
(void) dst;
|
|
||||||
(void) src1_dd;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYCL_MAX_DEVICES> & tensor_split) {
|
static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYCL_MAX_DEVICES> & tensor_split) {
|
||||||
int64_t min_compute_capability = INT_MAX;
|
int64_t min_compute_capability = INT_MAX;
|
||||||
int64_t max_compute_capability = INT_MIN;
|
int64_t max_compute_capability = INT_MIN;
|
||||||
|
|
|
@ -20,5 +20,6 @@
|
||||||
#include "mmq.hpp"
|
#include "mmq.hpp"
|
||||||
#include "mmvq.hpp"
|
#include "mmvq.hpp"
|
||||||
#include "rope.hpp"
|
#include "rope.hpp"
|
||||||
|
#include "norm.hpp"
|
||||||
|
|
||||||
#endif // GGML_SYCL_BACKEND_HPP
|
#endif // GGML_SYCL_BACKEND_HPP
|
||||||
|
|
|
@ -295,5 +295,60 @@ struct ggml_backend_sycl_context {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// common host functions
|
||||||
|
|
||||||
|
static inline int get_work_group_size(const sycl::device& device) {
|
||||||
|
dpct::device_info prop;
|
||||||
|
dpct::get_device_info(prop, device);
|
||||||
|
return prop.get_max_work_group_size();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// common device functions
|
||||||
|
|
||||||
|
static __dpct_inline__ float warp_reduce_sum(float x,
|
||||||
|
const sycl::nd_item<3>& item_ct1) {
|
||||||
|
#pragma unroll
|
||||||
|
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||||
|
/*
|
||||||
|
DPCT1096:98: The right-most dimension of the work-group used in the SYCL
|
||||||
|
kernel that calls this function may be less than "32". The function
|
||||||
|
"dpct::permute_sub_group_by_xor" may return an unexpected result on the
|
||||||
|
CPU device. Modify the size of the work-group to ensure that the value
|
||||||
|
of the right-most dimension is a multiple of "32".
|
||||||
|
*/
|
||||||
|
x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask);
|
||||||
|
}
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
|
||||||
|
static __dpct_inline__ sycl::float2
|
||||||
|
warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3>& item_ct1) {
|
||||||
|
#pragma unroll
|
||||||
|
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||||
|
a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(),
|
||||||
|
mask);
|
||||||
|
a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(),
|
||||||
|
mask);
|
||||||
|
}
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
static __dpct_inline__ float warp_reduce_max(float x,
|
||||||
|
const sycl::nd_item<3>& item_ct1) {
|
||||||
|
#pragma unroll
|
||||||
|
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||||
|
/*
|
||||||
|
DPCT1096:97: The right-most dimension of the work-group used in the SYCL
|
||||||
|
kernel that calls this function may be less than "32". The function
|
||||||
|
"dpct::permute_sub_group_by_xor" may return an unexpected result on the
|
||||||
|
CPU device. Modify the size of the work-group to ensure that the value
|
||||||
|
of the right-most dimension is a multiple of "32".
|
||||||
|
*/
|
||||||
|
x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
|
||||||
|
item_ct1.get_sub_group(), x, mask));
|
||||||
|
}
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
|
||||||
#endif // GGML_SYCL_COMMON_HPP
|
#endif // GGML_SYCL_COMMON_HPP
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue