From 13fe28259d53c62662fe8175a09fa10c0a35c241 Mon Sep 17 00:00:00 2001
From: uvos
Date: Sun, 23 Jun 2024 18:59:36 +0200
Subject: [PATCH] gfx908 optimizations

---
 ggml-cuda.cu         |  2 +-
 ggml-cuda/common.cuh | 11 +++++++++++
 ggml-cuda/mmq.cuh    |  6 +++++-
 ggml-cuda/mmvq.cu    | 12 ++++++++++--
 4 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index f914efd71..71f0021e8 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1911,7 +1911,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
 
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 
-    const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
+    const bool fp16_performance_good = min_compute_capability >= CC_GCN4;
 
 #ifdef CUDA_USE_TENSOR_CORES
     use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
diff --git a/ggml-cuda/common.cuh b/ggml-cuda/common.cuh
index 5bd24ebe5..6f96497f5 100644
--- a/ggml-cuda/common.cuh
+++ b/ggml-cuda/common.cuh
@@ -142,6 +142,9 @@
 #define CC_TURING 750
 #define CC_AMPERE 800
 #define CC_OFFSET_AMD 1000000
+#define CC_GCN4 (CC_OFFSET_AMD + 803)
+#define CC_VEGA (CC_OFFSET_AMD + 900)
+#define CC_CDNA (CC_OFFSET_AMD + 908)
 #define CC_RDNA1 (CC_OFFSET_AMD + 1010)
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
 #define CC_RDNA3 (CC_OFFSET_AMD + 1100)
@@ -233,6 +236,14 @@ typedef float2 dfloat2;
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
 
+#if defined(__gfx908__) || defined(__gfx90a__)
+#define CDNA
+#endif
+
+#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__)
+#define GCN
+#endif
+
 #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
     defined(__gfx1150__) || defined(__gfx1151__)
 #define RDNA3
diff --git a/ggml-cuda/mmq.cuh b/ggml-cuda/mmq.cuh
index e2d07c202..d1fd7ff56 100644
--- a/ggml-cuda/mmq.cuh
+++ b/ggml-cuda/mmq.cuh
@@ -53,7 +53,11 @@ static constexpr __device__ int get_mmq_x_max_device() {
 
 static constexpr __device__ int get_mmq_y_device() {
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(CDNA) || defined(GCN)
+    return 32;
+#else
     return 128;
+#endif // defined(CDNA)
 #else
 #if __CUDA_ARCH__ >= CC_VOLTA
     return 128;
@@ -1972,7 +1976,7 @@ static __device__ void mul_mat_q_process_tile(
 
 template <ggml_type type, int mmq_x, int nwarps, bool need_check>
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
+#if defined(RDNA3) || defined(RDNA2) || defined(CDNA)
     __launch_bounds__(WARP_SIZE*nwarps, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
 #else
diff --git a/ggml-cuda/mmvq.cu b/ggml-cuda/mmvq.cu
index e8d157169..4b6569c85 100644
--- a/ggml-cuda/mmvq.cu
+++ b/ggml-cuda/mmvq.cu
@@ -56,13 +56,21 @@ static __global__ void mul_mat_vec_q(
 
     constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
 
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA2) || defined(RDNA3)
     constexpr int nwarps              = 1;
     constexpr int rows_per_cuda_block = 1;
+#elif defined(CDNA)
+    constexpr int nwarps              = ncols_y <= 4 ? 4 : 2;
+    constexpr int rows_per_cuda_block = ncols_y == 4 ? ncols_y : 4;
 #else
     constexpr int nwarps              = ncols_y <= 4 ? 4 : 2;
     constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
+#endif
+#else
+    constexpr int nwarps              = ncols_y <= 4 ? 4 : 2;
+    constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 
     const int tid  = WARP_SIZE*threadIdx.y + threadIdx.x;
     const int row0 = rows_per_cuda_block*blockIdx.x;