llama : enable flash attn automatically when supported

slaren 2024-10-30 23:30:04 +01:00
parent b9e02e8184
commit afc4a7de65
2 changed files with 109 additions and 55 deletions


@@ -3148,6 +3148,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_RWKV_WKV:
            return true;
        case GGML_OP_FLASH_ATTN_EXT: {
            // FIXME: this is not accurate, the flash attn implementation only has kernels for a limited number of configurations,
            // which varies depending on too many factors to duplicate here.
#ifndef FLASH_ATTN_AVAILABLE
            return false;
#endif
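
Given the commit title, this per-device supports_op check is presumably what llama.cpp consults to decide whether flash attention can be turned on automatically. The sketch below is not taken from this commit; it only illustrates that probing pattern against the public ggml-backend API (ggml_backend_dev_supports_op, ggml_backend_dev_count, ggml_backend_dev_get), building a metadata-only GGML_OP_FLASH_ATTN_EXT node and asking each device about it. The head size, KV length and tensor types are arbitrary values chosen for the example.

#include <math.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#include "ggml.h"
#include "ggml-backend.h"

// Probe: create a flash attention node without allocating data and ask the
// device whether it supports it. Shapes and types are illustrative only.
static bool device_supports_flash_attn(ggml_backend_dev_t dev, int64_t head_dim, int64_t n_head, int64_t n_kv) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8 + 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // supports_op only inspects shapes and types
    };
    struct ggml_context * ctx = ggml_init(params);

    // Q in F32, K/V in F16 is a common configuration (assumed here).
    struct ggml_tensor * q = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, 1,    n_head);
    struct ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, head_dim, n_kv, n_head);
    struct ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, head_dim, n_kv, n_head);

    struct ggml_tensor * fa = ggml_flash_attn_ext(ctx, q, k, v, /*mask =*/ NULL,
            1.0f/sqrtf((float) head_dim), /*max_bias =*/ 0.0f, /*logit_softcap =*/ 0.0f);

    bool ok = ggml_backend_dev_supports_op(dev, fa);
    ggml_free(ctx);
    return ok;
}

int main(void) {
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        printf("%s: flash attn %s\n", ggml_backend_dev_name(dev),
                device_supports_flash_attn(dev, 128, 32, 4096) ? "supported" : "not supported");
    }
    return 0;
}

As the FIXME in the diff notes, the backend's answer is itself approximate: the CUDA flash attention implementation only has kernels for a limited set of configurations, and the supports_op check does not fully enumerate them, so a probe like this has to use shapes that match the actual model rather than fixed example values.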