From 13fe28259d53c62662fe8175a09fa10c0a35c241 Mon Sep 17 00:00:00 2001
From: uvos
Date: Sun, 23 Jun 2024 18:59:36 +0200
Subject: [PATCH] gfx908 optimizations

---
 ggml-cuda.cu         |  2 +-
 ggml-cuda/common.cuh | 11 +++++++++++
 ggml-cuda/mmq.cuh    |  6 +++++-
 ggml-cuda/mmvq.cu    | 12 ++++++++++--
 4 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index f914efd71..71f0021e8 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1911,7 +1911,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
 
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 
-    const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
+    const bool fp16_performance_good = min_compute_capability >= CC_GCN4;
 
 #ifdef CUDA_USE_TENSOR_CORES
     use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
diff --git a/ggml-cuda/common.cuh b/ggml-cuda/common.cuh
index 5bd24ebe5..6f96497f5 100644
--- a/ggml-cuda/common.cuh
+++ b/ggml-cuda/common.cuh
@@ -142,6 +142,9 @@
 #define CC_TURING 750
 #define CC_AMPERE 800
 #define CC_OFFSET_AMD 1000000
+#define CC_GCN4 (CC_OFFSET_AMD + 803)
+#define CC_VEGA (CC_OFFSET_AMD + 900)
+#define CC_CDNA (CC_OFFSET_AMD + 908)
 #define CC_RDNA1 (CC_OFFSET_AMD + 1010)
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
 #define CC_RDNA3 (CC_OFFSET_AMD + 1100)
@@ -233,6 +236,14 @@ typedef float2 dfloat2;
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
 
+#if defined(__gfx908__) || defined(__gfx90a__)
+#define CDNA
+#endif
+
+#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__)
+#define GCN
+#endif
+
 #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
     defined(__gfx1150__) || defined(__gfx1151__)
 #define RDNA3
diff --git a/ggml-cuda/mmq.cuh b/ggml-cuda/mmq.cuh
index e2d07c202..d1fd7ff56 100644
--- a/ggml-cuda/mmq.cuh
+++ b/ggml-cuda/mmq.cuh
@@ -53,7 +53,11 @@ static constexpr __device__ int get_mmq_x_max_device() {
 
 static constexpr __device__ int get_mmq_y_device() {
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(CDNA) || defined(GCN)
+    return 32;
+#else
     return 128;
+#endif // defined(CDNA)
 #else
 #if __CUDA_ARCH__ >= CC_VOLTA
     return 128;
@@ -1972,7 +1976,7 @@ static __device__ void mul_mat_q_process_tile(
 
 template <ggml_type type, int mmq_x, int nwarps, bool need_check>
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
+#if defined(RDNA3) || defined(RDNA2) || defined(CDNA)
     __launch_bounds__(WARP_SIZE*nwarps, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
 #else
diff --git a/ggml-cuda/mmvq.cu b/ggml-cuda/mmvq.cu
index e8d157169..4b6569c85 100644
--- a/ggml-cuda/mmvq.cu
+++ b/ggml-cuda/mmvq.cu
@@ -56,13 +56,21 @@ static __global__ void mul_mat_vec_q(
 
     constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
 
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA2) || defined(RDNA3)
     constexpr int nwarps              = 1;
     constexpr int rows_per_cuda_block = 1;
+#elif defined(CDNA)
+    constexpr int nwarps              = ncols_y <= 4 ? 4 : 2;
+    constexpr int rows_per_cuda_block = ncols_y == 4 ? ncols_y : 4;
 #else
     constexpr int nwarps              = ncols_y <= 4 ? 4 : 2;
     constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
+#endif
+#else
+    constexpr int nwarps              = ncols_y <= 4 ? 4 : 2;
+    constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 
     const int tid  = WARP_SIZE*threadIdx.y + threadIdx.x;
     const int row0 = rows_per_cuda_block*blockIdx.x;