fix mask of permute_sub_group_by_xor
This commit is contained in:
parent
e18bf85db8
commit
a85ad06eb4
3 changed files with 19 additions and 19 deletions
|
@ -584,7 +584,7 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy,
|
||||||
float sum = xi;
|
float sum = xi;
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||||
amax = sycl::fmax(amax, dpct::permute_sub_group_by_xor(
|
amax = sycl::fmax(amax, dpct::permute_sub_group_by_xor(
|
||||||
item_ct1.get_sub_group(), amax, mask));
|
item_ct1.get_sub_group(), amax, mask));
|
||||||
sum +=
|
sum +=
|
||||||
|
@ -728,7 +728,7 @@ static void mul_mat_p021_f16_f32(
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||||
tmp +=
|
tmp +=
|
||||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||||
}
|
}
|
||||||
|
@ -781,7 +781,7 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||||
tmp +=
|
tmp +=
|
||||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||||
}
|
}
|
||||||
|
|
|
@ -76,7 +76,7 @@ static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat *
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||||
tmp +=
|
tmp +=
|
||||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||||
}
|
}
|
||||||
|
@ -227,7 +227,7 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||||
tmp +=
|
tmp +=
|
||||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||||
}
|
}
|
||||||
|
@ -346,7 +346,7 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||||
tmp +=
|
tmp +=
|
||||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||||
}
|
}
|
||||||
|
@ -499,7 +499,7 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||||
tmp +=
|
tmp +=
|
||||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||||
}
|
}
|
||||||
|
@ -633,7 +633,7 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||||
tmp +=
|
tmp +=
|
||||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||||
}
|
}
|
||||||
|
@ -748,7 +748,7 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||||
tmp +=
|
tmp +=
|
||||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||||
}
|
}
|
||||||
|
|
|
@ -37,7 +37,7 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||||
tmp +=
|
tmp +=
|
||||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||||
}
|
}
|
||||||
|
@ -85,7 +85,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||||
tmp +=
|
tmp +=
|
||||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||||
}
|
}
|
||||||
|
@ -133,7 +133,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||||
tmp +=
|
tmp +=
|
||||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||||
}
|
}
|
||||||
|
@ -181,7 +181,7 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||||
tmp +=
|
tmp +=
|
||||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||||
}
|
}
|
||||||
|
@ -229,7 +229,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||||
tmp +=
|
tmp +=
|
||||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||||
}
|
}
|
||||||
|
@ -277,7 +277,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||||
tmp +=
|
tmp +=
|
||||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||||
}
|
}
|
||||||
|
@ -325,7 +325,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||||
tmp +=
|
tmp +=
|
||||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||||
}
|
}
|
||||||
|
@ -373,7 +373,7 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||||
tmp +=
|
tmp +=
|
||||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||||
}
|
}
|
||||||
|
@ -421,7 +421,7 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||||
tmp +=
|
tmp +=
|
||||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||||
}
|
}
|
||||||
|
@ -470,7 +470,7 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||||
tmp +=
|
tmp +=
|
||||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue