Adding the x and y components of half2 separately (instead of using the packed half2 +=) seems to be compiled faster
This commit is contained in:
parent
9e6f2e2aff
commit
6c8fdb8e5a
1 changed file with 18 additions and 7 deletions
|
@@ -384,13 +384,24 @@ static __device__ __forceinline__ float2 warp_reduce_sum_impl_amd(float2 a) {
|
||||||
return a;
|
return a;
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ __forceinline__ half2 warp_reduce_sum_impl_amd(half2 x) {
|
static __device__ __forceinline__ half2 warp_reduce_sum_impl_amd(half2 a) {
|
||||||
x += hip_ds_swizzleh2(x, AMD_SWIZZLE_MASK(0x1F, 0, 0x10));
|
half2 tmp;
|
||||||
x += hip_move_dpph2(x, AMD_DPP_ROW_RR(8), 0xF, 0xF, true);
|
tmp = hip_ds_swizzleh2(a, AMD_SWIZZLE_MASK(0x1F, 0, 0x10));
|
||||||
x += hip_move_dpph2(x, AMD_DPP_ROW_RR(4), 0xF, 0xF, true);
|
a.data.x += tmp.data.x;
|
||||||
x += hip_move_dpph2(x, AMD_DPP_ROW_RR(2), 0xF, 0xF, true);
|
a.data.y += tmp.data.y;
|
||||||
x += hip_move_dpph2(x, AMD_DPP_ROW_RR(1), 0xF, 0xF, true);
|
tmp = hip_move_dpph2(a, AMD_DPP_ROW_RR(8), 0xF, 0xF, true);
|
||||||
return x;
|
a.data.x += tmp.data.x;
|
||||||
|
a.data.y += tmp.data.y;
|
||||||
|
tmp = hip_move_dpph2(a, AMD_DPP_ROW_RR(4), 0xF, 0xF, true);
|
||||||
|
a.data.x += tmp.data.x;
|
||||||
|
a.data.y += tmp.data.y;
|
||||||
|
tmp = hip_move_dpph2(a, AMD_DPP_ROW_RR(2), 0xF, 0xF, true);
|
||||||
|
a.data.x += tmp.data.x;
|
||||||
|
a.data.y += tmp.data.y;
|
||||||
|
tmp = hip_move_dpph2(a, AMD_DPP_ROW_RR(1), 0xF, 0xF, true);
|
||||||
|
a.data.x += tmp.data.x;
|
||||||
|
a.data.y += tmp.data.y;
|
||||||
|
return a;
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ __forceinline__ float warp_reduce_max_impl_amd(float x) {
|
static __device__ __forceinline__ float warp_reduce_max_impl_amd(float x) {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue