llama : avoid ggml_cast, use F32 query
commit f9ca5dcbe8
parent 40ea8cd1ac

6 changed files with 44 additions and 17 deletions
ggml.h (4 changes)
@@ -1633,6 +1633,10 @@ extern "C" {
             struct ggml_tensor * mask,
             float scale);
 
+    GGML_API void ggml_flash_attn_ext_set_prec(
+            struct ggml_tensor * a,
+            enum ggml_prec prec);
+
     GGML_API struct ggml_tensor * ggml_flash_attn_back(
             struct ggml_context * ctx,
             struct ggml_tensor * q,
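The ggml_flash_attn_ext_set_prec() declared in this hunk lets a caller request F32 precision for a flash-attention node, which is what allows the llama.cpp graph to keep the query tensor in F32 instead of casting it to F16 with ggml_cast. Below is a minimal sketch of how the two declarations in the hunk fit together when building such a node; the helper name build_attn_f32, the tensor shapes, and the NULL mask are illustrative assumptions, not code taken from this commit.

// Minimal usage sketch (assumed shapes and helper name), not the commit's call site.
#include <math.h>
#include "ggml.h"

static struct ggml_tensor * build_attn_f32(struct ggml_context * ctx,
        int64_t head_dim, int64_t n_head, int64_t n_tokens, int64_t n_kv) {
    // query stays in F32 (no ggml_cast to F16); K/V remain F16
    struct ggml_tensor * q = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, n_tokens, n_head);
    struct ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, head_dim, n_kv,     n_head);
    struct ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, head_dim, n_kv,     n_head);

    const float scale = 1.0f/sqrtf((float) head_dim);

    // build the flash-attention node (mask omitted in this sketch)
    struct ggml_tensor * cur = ggml_flash_attn_ext(ctx, q, k, v, NULL, scale);

    // request F32 precision for this node via the new setter
    ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);

    return cur;
}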