llama : advanced batch splits

This includes equal-sequence-length batch splits, which are useful to simplify recurrent model operators.

* llama : always make recurrent state slots contiguous
* ggml : simplify mamba operators
2024-07-16 20:33:45 -04:00 · 2024-07-16 20:33:45 -04:00 · c51daefc32
commit c51daefc32
parent a38b884c6c
3 changed files with 1056 additions and 643 deletions
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -1760,10 +1760,8 @@ extern "C" {

    GGML_API struct ggml_tensor * ggml_ssm_conv(
            struct ggml_context * ctx,
-            struct ggml_tensor  * s,
-            struct ggml_tensor  * x,
-            struct ggml_tensor  * c,
-            struct ggml_tensor  * sq);
+            struct ggml_tensor  * sx,
+            struct ggml_tensor  * c);

    GGML_API struct ggml_tensor * ggml_ssm_scan(
            struct ggml_context * ctx,
@@ -1772,8 +1770,7 @@ extern "C" {
            struct ggml_tensor  * dt,
            struct ggml_tensor  * A,
            struct ggml_tensor  * B,
-            struct ggml_tensor  * C,
-            struct ggml_tensor  * sq);
+            struct ggml_tensor  * C);

    // partition into non-overlapping windows with padding if needed
    // example: