Merge branch 'master' into compilade/refactor-kv-cache

commit b7ec12ebf7
182 changed files with 57387 additions and 40048 deletions

llama.h: 13 changes
@@ -40,7 +40,7 @@
 #define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
 
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 5
+#define LLAMA_SESSION_VERSION 6
 
 #define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
 #define LLAMA_STATE_SEQ_VERSION 1
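
The session file format version is bumped from 5 to 6, so session files written by older builds will not round-trip. As a minimal sketch (not part of this diff), client code that serializes sessions could pin the version it was reviewed against so the bump is caught at build time:

#include "llama.h"

// Fail the build if the session format changed since this code was last reviewed.
#if LLAMA_SESSION_VERSION != 6
#error "LLAMA_SESSION_VERSION changed; review session (de)serialization code"
#endif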
@@ -79,6 +79,11 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_MPT        = 5,
         LLAMA_VOCAB_PRE_TYPE_STARCODER  = 6,
         LLAMA_VOCAB_PRE_TYPE_GPT2      = 7,
+        LLAMA_VOCAB_PRE_TYPE_REFACT     = 8,
+        LLAMA_VOCAB_PRE_TYPE_COMMAND_R  = 9,
+        LLAMA_VOCAB_PRE_TYPE_QWEN2      = 10,
+        LLAMA_VOCAB_PRE_TYPE_OLMO       = 11,
+        LLAMA_VOCAB_PRE_TYPE_DBRX       = 12,
     };
 
     // note: these values should be synchronized with ggml_rope
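
Five pre-tokenizer types come in from master. As a hedged illustration only (the real mapping lives in llama.cpp's model loader, and the string keys below are assumptions, not the exact names it uses), a loader might translate a model's pre-tokenizer name into the new enum values like so:

#include <string.h>
#include "llama.h"

// Map a pre-tokenizer name onto the newly added enum values (keys are illustrative).
static enum llama_vocab_pre_type pre_type_from_name(const char * name) {
    if (strcmp(name, "refact")    == 0) return LLAMA_VOCAB_PRE_TYPE_REFACT;
    if (strcmp(name, "command-r") == 0) return LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
    if (strcmp(name, "qwen2")     == 0) return LLAMA_VOCAB_PRE_TYPE_QWEN2;
    if (strcmp(name, "olmo")      == 0) return LLAMA_VOCAB_PRE_TYPE_OLMO;
    if (strcmp(name, "dbrx")      == 0) return LLAMA_VOCAB_PRE_TYPE_DBRX;
    return LLAMA_VOCAB_PRE_TYPE_DEFAULT; // fall back to the default pre-tokenizer
}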
@@ -134,6 +139,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_M         = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS        = 30, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ1_M         = 31, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_BF16          = 32, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
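
LLAMA_FTYPE_MOSTLY_BF16 exposes the bfloat16 conversion added on master. A short usage sketch, assuming the existing quantization API (the file names are placeholders):

#include "llama.h"

// Convert a model to bf16 using the new file type; returns 0 on success.
uint32_t quantize_to_bf16(void) {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype = LLAMA_FTYPE_MOSTLY_BF16; // new in this merge
    return llama_model_quantize("model-f32.gguf", "model-bf16.gguf", &params);
}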
@@ -171,7 +177,7 @@ extern "C" {
         bool sorted;
     } llama_token_data_array;
 
-    typedef bool (*llama_progress_callback)(float progress, void *ctx);
+    typedef bool (*llama_progress_callback)(float progress, void * user_data);
 
     // Input data for llama_decode
     // A llama_batch object can contain input about one or many sequences
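
The callback's second parameter is renamed from ctx to user_data, which avoids confusion with llama_context; the signature is otherwise unchanged. A sketch of a conforming callback, registered through the existing llama_model_params fields:

#include <stdbool.h>
#include <stdio.h>
#include "llama.h"

// Matches the renamed signature; user_data is whatever pointer was registered
// via llama_model_params.progress_callback_user_data.
static bool on_progress(float progress, void * user_data) {
    (void) user_data;
    fprintf(stderr, "\rloading: %3.0f%%", progress * 100.0f);
    return true; // returning false aborts the load
}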
@@ -287,6 +293,7 @@ extern "C" {
         bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn;  // whether to use flash attention
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
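
flash_attn is a new context flag carried in from master. A minimal sketch of enabling it at context creation, assuming a model loaded elsewhere:

#include "llama.h"

// Create a context with flash attention enabled (requires backend support).
struct llama_context * make_ctx(struct llama_model * model) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.flash_attn = true; // new flag from this merge
    return llama_new_context_with_model(model, cparams);
}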
@@ -551,7 +558,7 @@ extern "C" {
     // Returns the number of used recurrent state cells (i.e. have at least one sequence assigned to them)
     LLAMA_API int32_t llama_get_rs_cache_used_cells(const struct llama_context * ctx);
 
-    // Clear the KV and recurrent state caches
+    // Clear the KV cache and recurrent states - both cell info is erased and KV data is zeroed
     LLAMA_API void llama_cache_clear(
             struct llama_context * ctx);
     LLAMA_API DEPRECATED(void llama_kv_cache_clear(
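
The reworded comment pins down the semantics of llama_cache_clear, which this branch provides alongside the now-deprecated llama_kv_cache_clear. A small usage sketch:

#include "llama.h"

// Drop all cached state between unrelated prompts: cell metadata is erased and
// the data is zeroed for both the KV cache and the recurrent states.
void reset_context(struct llama_context * ctx) {
    llama_cache_clear(ctx); // preferred over the deprecated llama_kv_cache_clear(ctx)
}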