llama : enable flash attn automatically when supported
commit afc4a7de65 (parent b9e02e8184)
2 changed files with 109 additions and 55 deletions
@@ -3148,6 +3148,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_RWKV_WKV:
             return true;
         case GGML_OP_FLASH_ATTN_EXT: {
+            // FIXME: this is not accurate, the flash attn implementation only has kernels for a limited number of configurations,
+            // which varies depending on too many factors to duplicate here.
 #ifndef FLASH_ATTN_AVAILABLE
             return false;
 #endif
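For context, the auto-enable behavior in the commit title rests on asking a backend device whether it can run a GGML_OP_FLASH_ATTN_EXT node, which is what the supports_op hunk above reports for CUDA. The sketch below is a hypothetical, minimal probe written against the public ggml-backend API; it is not the code added by this commit, and the tensor shapes, head counts, and the helper name supports_flash_attn are illustrative assumptions only.

    // Hypothetical probe: build a throwaway GGML_OP_FLASH_ATTN_EXT node (metadata
    // only, no allocation) and ask a device whether it claims support for it.
    #include <math.h>
    #include <stdbool.h>
    #include <stddef.h>

    #include "ggml.h"
    #include "ggml-backend.h"

    static bool supports_flash_attn(ggml_backend_dev_t dev) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ ggml_tensor_overhead() * 8,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true, // tensor metadata is enough to query supports_op
        };
        struct ggml_context * ctx = ggml_init(params);

        // assumed example shapes: head_dim = 128, 256 tokens, 32 Q heads, 8 KV heads
        struct ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 128, 256, 32, 1);
        struct ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 128, 256,  8, 1);
        struct ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 128, 256,  8, 1);

        // no mask, scale = 1/sqrt(head_dim), no ALiBi bias, no logit softcap
        struct ggml_tensor * fa = ggml_flash_attn_ext(ctx, q, k, v, NULL,
                                                      1.0f/sqrtf(128.0f), 0.0f, 0.0f);

        bool ok = ggml_backend_dev_supports_op(dev, fa);
        ggml_free(ctx);
        return ok;
    }

A caller could obtain a device with ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) and, if the probe returns false, fall back to the non-flash-attention path; as the FIXME in the hunk notes, the CUDA answer is an approximation because the available kernels only cover a limited set of configurations.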