llama : disable flash attention (FA) for AMD

2024-04-24 16:48:10 +03:00 · 2024-04-24 16:48:10 +03:00 · ce281b904c
commit ce281b904c
parent 8937ec5307
3 changed files with 12 additions and 2 deletions
--- a/ggml-cuda/common.cuh
+++ b/ggml-cuda/common.cuh
@ -399,8 +399,8 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {

 #define FP16_AVAILABLE     defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
    defined(RDNA1) || defined(RDNA2) || defined(RDNA3) : __CUDA_ARCH__ >= CC_PASCAL
-#define FP16_MMA_AVAILABLE defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
-                                        defined(RDNA3) : __CUDA_ARCH__ >= CC_VOLTA
+
+#define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA

 // TODO: move to ggml-common.h
 static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
--- a/ggml-cuda/fattn.cu
+++ b/ggml-cuda/fattn.cu
@ -2,7 +2,10 @@
 #include "fattn.cuh"

 #include <cstdint>
+
+#if FP16_MMA_AVAILABLE
 #include <mma.h>
+#endif

 #define FATTN_KQ_STRIDE       256
 #define HALF_MAX_HALF         __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction.
--- a/llama.cpp
+++ b/llama.cpp
@ -15357,6 +15357,13 @@ struct llama_context * llama_new_context_with_model(
        cparams.flash_attn = false;
    }

+#ifdef GGML_USE_HIPBLAS
+    if (cparams.flash_attn) {
+        LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with HIPBLAS builds - forcing off\n", __func__);
+        cparams.flash_attn = false;
+    }
+#endif
+
    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
    }