Allow all RDNA2 archs to use sdot4 intrinsic (#8629)
The check gating the use of `__builtin_amdgcn_sdot4` specifically checks for gfx1030. This causes a severe perf regression for any gfx103? target that is not gfx1030 and is not using `HSA_OVERRIDE_GFX_VERSION` (if you've built ROCm to support it). We already have a generic RDNA2 define, so let's use it.
This commit is contained in:
		
							parent
							
								
									e7e6487ba0
								
							
						
					
					
						commit
						46e47417aa
					
				
					 1 changed files with 1 additions and 1 deletions
				
			
		|  | @ -459,7 +459,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half | |||
| 
 | ||||
| static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) { | ||||
| #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) | ||||
| #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) | ||||
| #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2) | ||||
|     c = __builtin_amdgcn_sdot4(a, b, c, false); | ||||
| #elif defined(RDNA3) | ||||
|     c = __builtin_amdgcn_sudot4( true, a, true, b, c, false); | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue