musa: enable building fat binaries, enable unified memory, and disable Flash Attention on QY1 (MTT S80) (#9526)
* mtgpu: add mp_21 support
  Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
* mtgpu: disable flash attention on qy1 (MTT S80); disable q3_k and mul_mat_batched_cublas
  Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
* mtgpu: enable unified memory
  Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
* mtgpu: map cublasOperation_t to mublasOperation_t (sync code to latest)
  Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
---------
Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
This commit is contained in:
		
							parent
							
								
									912c331d3d
								
							
						
					
					
						commit
						c35e586ea5
					
				
					 6 changed files with 31 additions and 5 deletions
				
			
		|  | @ -50,6 +50,8 @@ | |||
| #define CC_RDNA1      (CC_OFFSET_AMD + 1010) | ||||
| #define CC_RDNA2      (CC_OFFSET_AMD + 1030) | ||||
| #define CC_RDNA3      (CC_OFFSET_AMD + 1100) | ||||
| #define CC_QY1        210 | ||||
| #define CC_QY2        220 | ||||
| 
 | ||||
| #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses | ||||
| 
 | ||||
|  | @ -134,6 +136,10 @@ typedef float2 dfloat2; | |||
| #define INT8_MMA_AVAILABLE | ||||
| #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING | ||||
| 
 | ||||
| #if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1) | ||||
| #define FLASH_ATTN_AVAILABLE | ||||
| #endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1) | ||||
| 
 | ||||
| static constexpr bool fast_fp16_available(const int cc) { | ||||
|     return cc >= CC_PASCAL && cc != 610; | ||||
| } | ||||
|  |  | |||
|  | @ -44,13 +44,17 @@ static __global__ void flash_attn_tile_ext_f32( | |||
|         const int ne1, | ||||
|         const int ne2, | ||||
|         const int ne3) { | ||||
| #ifndef FLASH_ATTN_AVAILABLE | ||||
|     NO_DEVICE_CODE; | ||||
|     return; | ||||
| #endif // FLASH_ATTN_AVAILABLE | ||||
|     // Skip unused kernel variants for faster compilation: | ||||
|     if (use_logit_softcap && !(D == 128 || D == 256)) { | ||||
|         NO_DEVICE_CODE; | ||||
|         return; | ||||
|     } | ||||
| 
 | ||||
|     //In this kernel Q, K, V are matrices while i, j, k are matrix indices. | ||||
|     // In this kernel Q, K, V are matrices while i, j, k are matrix indices. | ||||
| 
 | ||||
|     const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on. | ||||
|     const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel. | ||||
|  |  | |||
							
								
								
									
2 ggml/src/ggml-cuda/vendors/musa.h vendored
									
									
								
							|  | @ -26,6 +26,7 @@ | |||
| #define cublasSetStream mublasSetStream | ||||
| #define cublasSgemm mublasSgemm | ||||
| #define cublasStatus_t mublasStatus_t | ||||
| #define cublasOperation_t mublasOperation_t | ||||
| #define cublasGetStatusString mublasStatus_to_string | ||||
| #define cudaDataType_t musaDataType_t | ||||
| #define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer | ||||
|  | @ -56,6 +57,7 @@ | |||
| #define cudaLaunchHostFunc musaLaunchHostFunc | ||||
| #define cudaMalloc musaMalloc | ||||
| #define cudaMallocHost musaMallocHost | ||||
| #define cudaMallocManaged musaMallocManaged | ||||
| #define cudaMemcpy musaMemcpy | ||||
| #define cudaMemcpyAsync musaMemcpyAsync | ||||
| #define cudaMemcpyPeerAsync musaMemcpyPeerAsync | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue