llama: apply the mllama support patch

Signed-off-by: YiYing He <yiying@secondstate.io>
Author: YiYing He
Date: 2025-01-15 17:07:09 +08:00
parent cde3833239
commit 45a89e0cec
16 changed files with 440 additions and 11 deletions

@@ -249,6 +249,7 @@ extern "C" {
         llama_token  *  token;
         float        *  embd;
+        int32_t         n_embd;
         llama_pos    *  pos;
         int32_t      *  n_seq_id;
         llama_seq_id ** seq_id;
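
The new n_embd field records the width of each row in the embd buffer, so a caller can submit precomputed (e.g. image) embeddings whose width differs from the model's token path. A minimal sketch of filling such a batch; the names n_img_tokens, img_embd, and n_model_embd are illustrative and not part of the patch:

    // Sketch: a batch carrying precomputed embeddings instead of token ids.
    struct llama_batch batch = {0};
    batch.n_tokens = n_img_tokens;   // number of embedding rows
    batch.embd     = img_embd;       // buffer of n_img_tokens * n_model_embd floats
    batch.n_embd   = n_model_embd;   // new field: width of each embedding row
    batch.token    = NULL;           // unused when embd is set
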
@@ -343,6 +344,7 @@ extern "C" {
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
         bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
         bool no_perf;     // whether to measure performance timings
+        bool cross_attn;  // whether to use cross attention

         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
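
A caller opts into the cross-attention path via this flag when creating the context. A minimal sketch, assuming a model handle already loaded by the caller:

    struct llama_context_params cparams = llama_context_default_params();
    cparams.cross_attn = true;  // enable the cross-attention path added by this patch
    struct llama_context * ctx = llama_init_from_model(model, cparams);
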
@@ -443,6 +445,9 @@ extern "C" {
                      struct llama_context_params params),
             "use llama_init_from_model instead");

+    // TODO: this should most likely be passed in as part of a batch and not set on the context for all batches.
+    LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state);
+
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
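
As the TODO notes, the switch currently applies to the whole context rather than per batch, so a caller toggles it around the decodes that should attend to the image state. A rough usage sketch, with ctx and batch as set up above and error handling elided:

    llama_set_cross_attention(ctx, true);   // subsequent decodes attend to the image embeddings
    if (llama_decode(ctx, batch) != 0) {
        // handle decode failure
    }
    llama_set_cross_attention(ctx, false);  // back to text-only decoding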