From 6c8fdb8e5aac771ddd7e85af7b2c61a02ae46773 Mon Sep 17 00:00:00 2001 From: Engininja2 <139037756+Engininja2@users.noreply.github.com> Date: Sat, 11 May 2024 21:07:59 -0600 Subject: [PATCH] adding the components of half2 seems to be compiled faster --- ggml-cuda/common.cuh | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/ggml-cuda/common.cuh b/ggml-cuda/common.cuh index 1114e6af2..a2f822e77 100644 --- a/ggml-cuda/common.cuh +++ b/ggml-cuda/common.cuh @@ -384,13 +384,24 @@ static __device__ __forceinline__ float2 warp_reduce_sum_impl_amd(float2 a) { return a; } -static __device__ __forceinline__ half2 warp_reduce_sum_impl_amd(half2 x) { - x += hip_ds_swizzleh2(x, AMD_SWIZZLE_MASK(0x1F, 0, 0x10)); - x += hip_move_dpph2(x, AMD_DPP_ROW_RR(8), 0xF, 0xF, true); - x += hip_move_dpph2(x, AMD_DPP_ROW_RR(4), 0xF, 0xF, true); - x += hip_move_dpph2(x, AMD_DPP_ROW_RR(2), 0xF, 0xF, true); - x += hip_move_dpph2(x, AMD_DPP_ROW_RR(1), 0xF, 0xF, true); - return x; +static __device__ __forceinline__ half2 warp_reduce_sum_impl_amd(half2 a) { + half2 tmp; + tmp = hip_ds_swizzleh2(a, AMD_SWIZZLE_MASK(0x1F, 0, 0x10)); + a.data.x += tmp.data.x; + a.data.y += tmp.data.y; + tmp = hip_move_dpph2(a, AMD_DPP_ROW_RR(8), 0xF, 0xF, true); + a.data.x += tmp.data.x; + a.data.y += tmp.data.y; + tmp = hip_move_dpph2(a, AMD_DPP_ROW_RR(4), 0xF, 0xF, true); + a.data.x += tmp.data.x; + a.data.y += tmp.data.y; + tmp = hip_move_dpph2(a, AMD_DPP_ROW_RR(2), 0xF, 0xF, true); + a.data.x += tmp.data.x; + a.data.y += tmp.data.y; + tmp = hip_move_dpph2(a, AMD_DPP_ROW_RR(1), 0xF, 0xF, true); + a.data.x += tmp.data.x; + a.data.y += tmp.data.y; + return a; } static __device__ __forceinline__ float warp_reduce_max_impl_amd(float x) {