llama : enable flash attn automatically when supported

slaren 2024-10-30 23:30:04 +01:00
parent b9e02e8184
commit afc4a7de65
2 changed files with 109 additions and 55 deletions


@@ -3148,6 +3148,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_RWKV_WKV:
            return true;
        case GGML_OP_FLASH_ATTN_EXT: {
            // FIXME: this is not accurate, the flash attn implementation only has kernels for a limited number of configurations,
            // which varies depending on too many factors to duplicate here.
#ifndef FLASH_ATTN_AVAILABLE
            return false;
#endif
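
Given the commit title, this per-device supports_op check is presumably what llama.cpp consults to decide whether flash attention can be turned on automatically. The sketch below is not taken from this commit; it only illustrates that probing pattern against the public ggml-backend API (ggml_backend_dev_supports_op, ggml_backend_dev_count, ggml_backend_dev_get), building a metadata-only GGML_OP_FLASH_ATTN_EXT node and asking each device about it. The head size, KV length and tensor types are arbitrary values chosen for the example.

#include <math.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#include "ggml.h"
#include "ggml-backend.h"

// Probe: create a flash attention node without allocating data and ask the
// device whether it supports it. Shapes and types are illustrative only.
static bool device_supports_flash_attn(ggml_backend_dev_t dev, int64_t head_dim, int64_t n_head, int64_t n_kv) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8 + 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // supports_op only inspects shapes and types
    };
    struct ggml_context * ctx = ggml_init(params);

    // Q in F32, K/V in F16 is a common configuration (assumed here).
    struct ggml_tensor * q = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, 1,    n_head);
    struct ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, head_dim, n_kv, n_head);
    struct ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, head_dim, n_kv, n_head);

    struct ggml_tensor * fa = ggml_flash_attn_ext(ctx, q, k, v, /*mask =*/ NULL,
            1.0f/sqrtf((float) head_dim), /*max_bias =*/ 0.0f, /*logit_softcap =*/ 0.0f);

    bool ok = ggml_backend_dev_supports_op(dev, fa);
    ggml_free(ctx);
    return ok;
}

int main(void) {
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        printf("%s: flash attn %s\n", ggml_backend_dev_name(dev),
                device_supports_flash_attn(dev, 128, 32, 4096) ? "supported" : "not supported");
    }
    return 0;
}

As the FIXME in the diff notes, the backend's answer is itself approximate: the CUDA flash attention implementation only has kernels for a limited set of configurations, and the supports_op check does not fully enumerate them, so a probe like this has to use shapes that match the actual model rather than fixed example values.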