Merge branch 'master' into compilade/refactor-kv-cache

commit b7ec12ebf7
182 changed files with 57387 additions and 40048 deletions

llama.h: 13 changes
@@ -40,7 +40,7 @@
 #define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
 
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 5
+#define LLAMA_SESSION_VERSION 6
 
 #define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
 #define LLAMA_STATE_SEQ_VERSION 1
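
The session file format version is bumped from 5 to 6, so session files written by older builds will not round-trip. As a minimal sketch (not part of this diff), client code that serializes sessions could pin the version it was reviewed against so the bump is caught at build time:

#include "llama.h"

// Fail the build if the session format changed since this code was last reviewed.
#if LLAMA_SESSION_VERSION != 6
#error "LLAMA_SESSION_VERSION changed; review session (de)serialization code"
#endif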
@@ -79,6 +79,11 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_MPT        = 5,
         LLAMA_VOCAB_PRE_TYPE_STARCODER  = 6,
         LLAMA_VOCAB_PRE_TYPE_GPT2      = 7,
+        LLAMA_VOCAB_PRE_TYPE_REFACT     = 8,
+        LLAMA_VOCAB_PRE_TYPE_COMMAND_R  = 9,
+        LLAMA_VOCAB_PRE_TYPE_QWEN2      = 10,
+        LLAMA_VOCAB_PRE_TYPE_OLMO       = 11,
+        LLAMA_VOCAB_PRE_TYPE_DBRX       = 12,
     };
 
     // note: these values should be synchronized with ggml_rope
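
Five pre-tokenizer types come in from master. As a hedged illustration only (the real mapping lives in llama.cpp's model loader, and the string keys below are assumptions, not the exact names it uses), a loader might translate a model's pre-tokenizer name into the new enum values like so:

#include <string.h>
#include "llama.h"

// Map a pre-tokenizer name onto the newly added enum values (keys are illustrative).
static enum llama_vocab_pre_type pre_type_from_name(const char * name) {
    if (strcmp(name, "refact")    == 0) return LLAMA_VOCAB_PRE_TYPE_REFACT;
    if (strcmp(name, "command-r") == 0) return LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
    if (strcmp(name, "qwen2")     == 0) return LLAMA_VOCAB_PRE_TYPE_QWEN2;
    if (strcmp(name, "olmo")      == 0) return LLAMA_VOCAB_PRE_TYPE_OLMO;
    if (strcmp(name, "dbrx")      == 0) return LLAMA_VOCAB_PRE_TYPE_DBRX;
    return LLAMA_VOCAB_PRE_TYPE_DEFAULT; // fall back to the default pre-tokenizer
}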
@@ -134,6 +139,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_M         = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS        = 30, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ1_M         = 31, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_BF16          = 32, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
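
LLAMA_FTYPE_MOSTLY_BF16 exposes the bfloat16 conversion added on master. A short usage sketch, assuming the existing quantization API (the file names are placeholders):

#include "llama.h"

// Convert a model to bf16 using the new file type; returns 0 on success.
uint32_t quantize_to_bf16(void) {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype = LLAMA_FTYPE_MOSTLY_BF16; // new in this merge
    return llama_model_quantize("model-f32.gguf", "model-bf16.gguf", &params);
}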
@@ -171,7 +177,7 @@ extern "C" {
         bool sorted;
     } llama_token_data_array;
 
-    typedef bool (*llama_progress_callback)(float progress, void *ctx);
+    typedef bool (*llama_progress_callback)(float progress, void * user_data);
 
     // Input data for llama_decode
     // A llama_batch object can contain input about one or many sequences
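
The callback's second parameter is renamed from ctx to user_data, which avoids confusion with llama_context; the signature is otherwise unchanged. A sketch of a conforming callback, registered through the existing llama_model_params fields:

#include <stdbool.h>
#include <stdio.h>
#include "llama.h"

// Matches the renamed signature; user_data is whatever pointer was registered
// via llama_model_params.progress_callback_user_data.
static bool on_progress(float progress, void * user_data) {
    (void) user_data;
    fprintf(stderr, "\rloading: %3.0f%%", progress * 100.0f);
    return true; // returning false aborts the load
}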
@@ -287,6 +293,7 @@ extern "C" {
         bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn;  // whether to use flash attention
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
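
flash_attn is a new context flag carried in from master. A minimal sketch of enabling it at context creation, assuming a model loaded elsewhere:

#include "llama.h"

// Create a context with flash attention enabled (requires backend support).
struct llama_context * make_ctx(struct llama_model * model) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.flash_attn = true; // new flag from this merge
    return llama_new_context_with_model(model, cparams);
}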
@@ -551,7 +558,7 @@ extern "C" {
     // Returns the number of used recurrent state cells (i.e. have at least one sequence assigned to them)
     LLAMA_API int32_t llama_get_rs_cache_used_cells(const struct llama_context * ctx);
 
-    // Clear the KV and recurrent state caches
+    // Clear the KV cache and recurrent states - both cell info is erased and KV data is zeroed
     LLAMA_API void llama_cache_clear(
             struct llama_context * ctx);
     LLAMA_API DEPRECATED(void llama_kv_cache_clear(
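
The reworded comment pins down the semantics of llama_cache_clear, which this branch provides alongside the now-deprecated llama_kv_cache_clear. A small usage sketch:

#include "llama.h"

// Drop all cached state between unrelated prompts: cell metadata is erased and
// the data is zeroed for both the KV cache and the recurrent states.
void reset_context(struct llama_context * ctx) {
    llama_cache_clear(ctx); // preferred over the deprecated llama_kv_cache_clear(ctx)
}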