llama : flash_attn cparam + fix defrag
commit 599ce84a71
parent 2c41180e88
4 changed files with 198 additions and 163 deletions
llama.h | 1 +
@@ -270,6 +270,7 @@ extern "C" {
         bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn;  // whether to use flash attention

         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
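For context, the new flash_attn field lives in struct llama_context_params, so callers opt in to the Flash Attention path when creating a context. The following is a minimal usage sketch against the llama.cpp C API of this period; the model path "model.gguf" is a placeholder and error handling is kept to the essentials:

    #include "llama.h"
    #include <stdio.h>

    int main(void) {
        llama_backend_init();

        // Load a model; "model.gguf" is a placeholder path.
        struct llama_model_params mparams = llama_model_default_params();
        struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
        if (model == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        // Opt in to Flash Attention via the new context param.
        struct llama_context_params cparams = llama_context_default_params();
        cparams.flash_attn = true;

        struct llama_context * ctx = llama_new_context_with_model(model, cparams);
        if (ctx == NULL) {
            fprintf(stderr, "failed to create context\n");
            llama_free_model(model);
            return 1;
        }

        // ... run llama_decode() as usual ...

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }

The flag defaults to false, so existing callers keep the regular attention path unless they explicitly enable it.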