llama : rename many llama_kv_cache_* functions

2024-04-29 10:24:45 -04:00 · 2024-04-29 10:24:45 -04:00 · a09db95eab
commit a09db95eab
parent d66849f628
2 changed files with 138 additions and 45 deletions
--- a/llama.h
+++ b/llama.h
@@ -515,6 +515,12 @@ extern "C" {
    // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
    LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);

+    // Rebuild and check the validity of the recurrent state cache's tree of sequences.
+    // (slow, use only for debugging purposes)
+    // Returns whether or not the rs cache was valid.
+    // The errors are always corrected, but only logged when debug is true.
+    LLAMA_API bool llama_rs_cache_rebuild(struct llama_context * ctx, bool debug);
+
    // Returns the number of tokens in the KV cache (slow, use only for debug)
    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
    LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
@@ -522,36 +528,60 @@ extern "C" {
    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
    LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);

-    // Clear the KV cache
-    LLAMA_API void llama_kv_cache_clear(
+    // Returns the number of used recurrent state cells (i.e. have at least one sequence assigned to them)
+    LLAMA_API int32_t llama_get_rs_cache_used_cells(const struct llama_context * ctx);
+
+    // Clear the KV and recurrent state caches
+    LLAMA_API void llama_cache_clear(
            struct llama_context * ctx);
+    LLAMA_API DEPRECATED(void llama_kv_cache_clear(
+            struct llama_context * ctx),
+        "use llama_cache_clear instead");

    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
-    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
    // seq_id < 0 : match any sequence
    // p0 < 0     : [0,  p1]
    // p1 < 0     : [p0, inf)
-    LLAMA_API bool llama_kv_cache_seq_rm(
+    // Returns n_past; callers should handle this return value, in particular for partial removals
+    LLAMA_API llama_pos llama_cache_seq_rm(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
                       llama_pos   p0,
                       llama_pos   p1);
+    LLAMA_API DEPRECATED(bool llama_kv_cache_seq_rm(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1),
+        "use llama_cache_seq_rm instead, and handle its return value for partial removals");

    // Copy all tokens that belong to the specified sequence to another sequence
-    // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
+    // Note that this does not allocate extra KV or RS cache memory - it simply assigns the tokens to the new sequence
    // p0 < 0 : [0,  p1]
    // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_cp(
+    // Returns n_past; callers should handle this return value, in particular for partial copies
+    LLAMA_API llama_pos llama_cache_seq_cp(
            struct llama_context * ctx,
                    llama_seq_id   seq_id_src,
                    llama_seq_id   seq_id_dst,
                       llama_pos   p0,
                       llama_pos   p1);
+    LLAMA_API DEPRECATED(void llama_kv_cache_seq_cp(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id_src,
+                    llama_seq_id   seq_id_dst,
+                       llama_pos   p0,
+                       llama_pos   p1),
+        "use llama_cache_seq_cp instead, and handle its return value for partial copies");

    // Removes all tokens that do not belong to the specified sequence
-    LLAMA_API void llama_kv_cache_seq_keep(
+    LLAMA_API void llama_cache_seq_keep(
            struct llama_context * ctx,
                    llama_seq_id   seq_id);
+    LLAMA_API DEPRECATED(void llama_kv_cache_seq_keep(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id),
+        "use llama_cache_seq_keep instead");

    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
    // If the KV cache is RoPEd, the KV data is updated accordingly:
@@ -559,12 +589,20 @@ extern "C" {
    //   - explicitly with llama_kv_cache_update()
    // p0 < 0 : [0,  p1]
    // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_add(
+    // Returns n_past
+    LLAMA_API llama_pos llama_cache_seq_add(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
                       llama_pos   p0,
                       llama_pos   p1,
                       llama_pos   delta);
+    LLAMA_API DEPRECATED(void llama_kv_cache_seq_add(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1,
+                       llama_pos   delta),
+        "use llama_cache_seq_add instead");

    // Integer division of the positions by factor of `d > 1`
    // If the KV cache is RoPEd, the KV data is updated accordingly:
@@ -572,17 +610,29 @@ extern "C" {
    //   - explicitly with llama_kv_cache_update()
    // p0 < 0 : [0,  p1]
    // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_div(
+    // Returns n_past
+    LLAMA_API llama_pos llama_cache_seq_div(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
                       llama_pos   p0,
                       llama_pos   p1,
                             int   d);
+    LLAMA_API DEPRECATED(void llama_kv_cache_seq_div(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1,
+                             int   d),
+        "use llama_cache_seq_div instead");

-    // Returns the largest position present in the KV cache for the specified sequence
-    LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
+    // Returns the largest position present in the KV and/or RS cache for the specified sequence (-1 if the sequence has no cells)
+    LLAMA_API llama_pos llama_cache_seq_pos_max(
            struct llama_context * ctx,
                    llama_seq_id   seq_id);
+    LLAMA_API DEPRECATED(llama_pos llama_kv_cache_seq_pos_max(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id),
+        "use llama_cache_seq_pos_max instead, which also now returns -1 instead of 0 when the seq_id has no cells");

    // Defragment the KV cache
    // This will be applied: