Merge branch 'master' into compilade/mamba2

2024-09-18 09:13:46 -04:00 · 2024-09-18 09:13:46 -04:00 · 0e601cafe9
commit 0e601cafe9
parent 38913dc8dd 6443ddd985
197 changed files with 25587 additions and 16246 deletions
--- a/ggml/include/ggml-alloc.h
+++ b/ggml/include/ggml-alloc.h
@ -7,8 +7,8 @@ extern "C" {
 #endif

 typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
-typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
-typedef struct ggml_backend * ggml_backend_t;
+typedef struct      ggml_backend_buffer * ggml_backend_buffer_t;
+typedef struct             ggml_backend * ggml_backend_t;

 // Tensor allocator
 struct ggml_tallocr {
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@ -63,6 +63,7 @@ extern "C" {
    GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);

+    // "offset" refers to the offset of the tensor data for setting/getting data
    GGML_API GGML_CALL void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);

@ -102,6 +103,7 @@ extern "C" {

    GGML_API GGML_CALL bool ggml_backend_is_cpu                (ggml_backend_t backend);
    GGML_API           void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
+    GGML_API           void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
    GGML_API           void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);

    // Create a backend buffer from an existing pointer
--- a/ggml/include/ggml-cann.h
+++ b/ggml/include/ggml-cann.h
@ -80,6 +80,13 @@ ggml_backend_cann_buffer_type(int32_t device);
 */
 GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void);

+/**
+ * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
+ *
+ * @return A pointer to the host buffer type interface.
+ */
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
+
 /**
 * @brief Retrieves the description of a specific CANN device.
 *
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@ -220,7 +220,7 @@
 #include <stdio.h>

 #define GGML_FILE_MAGIC   0x67676d6c // "ggml"
-#define GGML_FILE_VERSION 1
+#define GGML_FILE_VERSION 2

 #define GGML_QNT_VERSION        2    // bump this on quantization format changes
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
@ -231,6 +231,8 @@
 #define GGML_MAX_SRC            10
 #ifndef GGML_MAX_NAME
 #define GGML_MAX_NAME           64
+#define GGML_MAX_N_THREADS      512
+
 #endif
 #define GGML_MAX_OP_PARAMS      64
 #define GGML_DEFAULT_N_THREADS  4
@ -356,6 +358,7 @@ extern "C" {

    struct ggml_object;
    struct ggml_context;
+    struct ggml_cgraph;

    // NOTE: always add types at the end of the enum to keep backward compatibility
    enum ggml_type {
@ -393,6 +396,8 @@ extern "C" {
        GGML_TYPE_Q4_0_4_4 = 31,
        GGML_TYPE_Q4_0_4_8 = 32,
        GGML_TYPE_Q4_0_8_8 = 33,
+        GGML_TYPE_TQ1_0   = 34,
+        GGML_TYPE_TQ2_0   = 35,
        GGML_TYPE_COUNT,
    };

@ -453,6 +458,8 @@ extern "C" {
        GGML_OP_SQR,
        GGML_OP_SQRT,
        GGML_OP_LOG,
+        GGML_OP_SIN,
+        GGML_OP_COS,
        GGML_OP_SUM,
        GGML_OP_SUM_ROWS,
        GGML_OP_MEAN,
@ -490,9 +497,11 @@ extern "C" {
        GGML_OP_CLAMP,
        GGML_OP_CONV_TRANSPOSE_1D,
        GGML_OP_IM2COL,
+        GGML_OP_IM2COL_BACK,
        GGML_OP_CONV_TRANSPOSE_2D,
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,
+        GGML_OP_POOL_2D_BACK,
        GGML_OP_UPSCALE, // nearest interpolate
        GGML_OP_PAD,
        GGML_OP_ARANGE,
@ -508,6 +517,7 @@ extern "C" {
        GGML_OP_WIN_UNPART,
        GGML_OP_GET_REL_POS,
        GGML_OP_ADD_REL_POS,
+        GGML_OP_RWKV_WKV,

        GGML_OP_UNARY,

@ -542,6 +552,7 @@ extern "C" {
        GGML_UNARY_OP_SILU,
        GGML_UNARY_OP_HARDSWISH,
        GGML_UNARY_OP_HARDSIGMOID,
+        GGML_UNARY_OP_EXP,

        GGML_UNARY_OP_COUNT,
    };
@ -553,10 +564,11 @@ extern "C" {
    };

    enum ggml_log_level {
-        GGML_LOG_LEVEL_ERROR = 2,
-        GGML_LOG_LEVEL_WARN  = 3,
-        GGML_LOG_LEVEL_INFO  = 4,
-        GGML_LOG_LEVEL_DEBUG = 5
+        GGML_LOG_LEVEL_NONE  = 0,
+        GGML_LOG_LEVEL_INFO  = 1,
+        GGML_LOG_LEVEL_WARN  = 2,
+        GGML_LOG_LEVEL_ERROR = 3,
+        GGML_LOG_LEVEL_DEBUG = 4,
    };

    enum ggml_tensor_flag {
@ -565,23 +577,9 @@ extern "C" {
        GGML_TENSOR_FLAG_PARAM  = 4,
    };

-    // ggml object
-    struct ggml_object {
-        size_t offs;
-        size_t size;
-
-        struct ggml_object * next;
-
-        enum ggml_object_type type;
-
-        char padding[4];
-    };
-
-    static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
-
    // n-dimensional tensor
    struct ggml_tensor {
-        enum ggml_type         type;
+        enum ggml_type type;

        GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");

@ -624,6 +622,29 @@ extern "C" {
    // If it returns true, the computation is aborted
    typedef bool (*ggml_abort_callback)(void * data);

+    // Scheduling priorities
+    enum ggml_sched_priority {
+        GGML_SCHED_PRIO_NORMAL,
+        GGML_SCHED_PRIO_MEDIUM,
+        GGML_SCHED_PRIO_HIGH,
+        GGML_SCHED_PRIO_REALTIME
+    };
+
+    // Threadpool params
+    // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
+    struct ggml_threadpool_params {
+        bool                cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
+        int                 n_threads;                   // number of threads
+        enum ggml_sched_priority prio;                   // thread priority
+        uint32_t            poll;                        // polling level (0 - no polling, 100 - aggressive polling)
+        bool                strict_cpu;                  // strict cpu placement
+        bool                paused;                      // start in paused state
+    };
+
+    struct ggml_threadpool;     // forward declaration, see ggml.c
+
+    typedef struct ggml_threadpool * ggml_threadpool_t;
+
    // the compute plan that needs to be prepared for ggml_graph_compute()
    // since https://github.com/ggerganov/ggml/issues/287
    struct ggml_cplan {
@ -631,41 +652,13 @@ extern "C" {
        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`

        int n_threads;
+        struct ggml_threadpool * threadpool;

        // abort ggml_graph_compute when true
        ggml_abort_callback abort_callback;
        void *              abort_callback_data;
    };

-    enum ggml_cgraph_eval_order {
-        GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
-        GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
-        GGML_CGRAPH_EVAL_ORDER_COUNT
-    };
-
-    typedef uint32_t ggml_bitset_t;
-
-    struct ggml_hash_set {
-        size_t size;
-        ggml_bitset_t * used;
-        struct ggml_tensor ** keys;
-    };
-
-    // computation graph
-    struct ggml_cgraph {
-        int size;
-        int n_nodes;
-        int n_leafs;
-
-        struct ggml_tensor ** nodes;
-        struct ggml_tensor ** grads;
-        struct ggml_tensor ** leafs;
-
-        struct ggml_hash_set visited_hash_set;
-
-        enum ggml_cgraph_eval_order order;
-    };
-
    // scratch buffer
    struct ggml_scratch {
        size_t offs;
@ -969,6 +962,22 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    GGML_API struct ggml_tensor * ggml_sin(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_sin_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_cos(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_cos_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
    // return scalar
    GGML_API struct ggml_tensor * ggml_sum(
            struct ggml_context * ctx,
@ -1119,6 +1128,14 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    GGML_API struct ggml_tensor * ggml_exp(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_exp_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
    // normalize along rows
    GGML_API struct ggml_tensor * ggml_norm(
            struct ggml_context * ctx,
@ -1214,7 +1231,7 @@ extern "C" {
            size_t                nb1,
            size_t                nb2,
            size_t                nb3,
-            size_t                offset);
+            size_t                offset); // in bytes

    // b -> view(a,offset,nb1,nb2,3), return view(a)
    GGML_API struct ggml_tensor * ggml_set_inplace(
@ -1224,19 +1241,19 @@ extern "C" {
            size_t                nb1,
            size_t                nb2,
            size_t                nb3,
-            size_t                offset);
+            size_t                offset); // in bytes

    GGML_API struct ggml_tensor * ggml_set_1d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
-            size_t                offset);
+            size_t                offset); // in bytes

    GGML_API struct ggml_tensor * ggml_set_1d_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
-            size_t                offset);
+            size_t                offset); // in bytes

    // b -> view(a,offset,nb1,nb2,3), return modified a
    GGML_API struct ggml_tensor * ggml_set_2d(
@ -1244,7 +1261,7 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
            size_t                nb1,
-            size_t                offset);
+            size_t                offset); // in bytes

    // b -> view(a,offset,nb1,nb2,3), return view(a)
    GGML_API struct ggml_tensor * ggml_set_2d_inplace(
@ -1252,7 +1269,7 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
            size_t                nb1,
-            size_t                offset);
+            size_t                offset); // in bytes

    // a -> b, return view(b)
    GGML_API struct ggml_tensor * ggml_cpy(
@ -1566,34 +1583,49 @@ extern "C" {
            float                 min,
            float                 max);

+    // im2col
+    // converts data into a format that effectively results in a convolution when combined with matrix multiplication
    GGML_API struct ggml_tensor * ggml_im2col(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                  s0,
-            int                  s1,
-            int                  p0,
-            int                  p1,
-            int                  d0,
-            int                  d1,
-            bool                 is_2D,
-            enum ggml_type       dst_type);
+            struct ggml_tensor  * a,  // convolution kernel
+            struct ggml_tensor  * b,  // data
+            int                   s0, // stride dimension 0
+            int                   s1, // stride dimension 1
+            int                   p0, // padding dimension 0
+            int                   p1, // padding dimension 1
+            int                   d0, // dilation dimension 0
+            int                   d1, // dilation dimension 1
+            bool                  is_2D,
+            enum ggml_type        dst_type);
+
+    GGML_API struct ggml_tensor * ggml_im2col_back(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,  // convolution kernel
+        struct ggml_tensor  * b,  // gradient of im2col output
+        int64_t             * ne, // shape of im2col input
+        int                   s0, // stride dimension 0
+        int                   s1, // stride dimension 1
+        int                   p0, // padding dimension 0
+        int                   p1, // padding dimension 1
+        int                   d0, // dilation dimension 0
+        int                   d1, // dilation dimension 1
+        bool                  is_2D);

    GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                  s0,
-            int                  s1,
-            int                  p0,
-            int                  p1,
-            int                  d0,
-            int                  d1);
+            struct ggml_tensor  * a,  // convolution kernel
+            struct ggml_tensor  * b,  // data
+            int                  s0,  // stride dimension 0
+            int                  s1,  // stride dimension 1
+            int                  p0,  // padding dimension 0
+            int                  p1,  // padding dimension 1
+            int                  d0,  // dilation dimension 0
+            int                  d1); // dilation dimension 1

    GGML_API struct ggml_tensor * ggml_conv_1d(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
+            struct ggml_tensor  * a,   // convolution kernel
+            struct ggml_tensor  * b,   // data
            int                   s0,  // stride
            int                   p0,  // padding
            int                   d0); // dilation
@ -1602,29 +1634,29 @@ extern "C" {
    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   s,
-            int                   d);
+            struct ggml_tensor  * a,  // convolution kernel
+            struct ggml_tensor  * b,  // data
+            int                   s,  // stride
+            int                   d); // dilation

    GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   s0,
-            int                   p0,
-            int                   d0);
+            struct ggml_tensor  * a,   // convolution kernel
+            struct ggml_tensor  * b,   // data
+            int                   s0,  // stride
+            int                   p0,  // padding
+            int                   d0); // dilation

    GGML_API struct ggml_tensor * ggml_conv_2d(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   s0,
-            int                   s1,
-            int                   p0,
-            int                   p1,
-            int                   d0,
-            int                   d1);
+            struct ggml_tensor  * a,   // convolution kernel
+            struct ggml_tensor  * b,   // data
+            int                   s0,  // stride dimension 0
+            int                   s1,  // stride dimension 1
+            int                   p0,  // padding dimension 0
+            int                   p1,  // padding dimension 1
+            int                   d0,  // dilation dimension 0
+            int                   d1); // dilation dimension 1


    // kernel size is a->ne[0] x a->ne[1]
@ -1686,6 +1718,18 @@ extern "C" {
            float                 p0,
            float                 p1);

+    GGML_API struct ggml_tensor * ggml_pool_2d_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * af, // "a"/input used in forward pass
+            enum ggml_op_pool     op,
+            int                   k0,
+            int                   k1,
+            int                   s0,
+            int                   s1,
+            float                 p0,
+            float                 p1);
+
    // nearest interpolate
    // multiplies ne0 and ne1 by scale factor
    // used in stable-diffusion
@ -1760,7 +1804,8 @@ extern "C" {
            struct ggml_tensor  * v,
            struct ggml_tensor  * mask,
            float                 scale,
-            float                 max_bias);
+            float                 max_bias,
+            float                 logit_softcap);

    GGML_API void ggml_flash_attn_ext_set_prec(
            struct ggml_tensor * a,
@ -1840,6 +1885,15 @@ extern "C" {
            struct ggml_tensor  * pw,
            struct ggml_tensor  * ph);

+    GGML_API struct ggml_tensor * ggml_rwkv_wkv(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * k,
+            struct ggml_tensor  * v,
+            struct ggml_tensor  * r,
+            struct ggml_tensor  * tf,
+            struct ggml_tensor  * td,
+            struct ggml_tensor  * state);
+
    // custom operators

    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
@ -1923,8 +1977,6 @@ extern "C" {
    typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
    typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);

-    #define GGML_N_TASKS_MAX -1
-
    GGML_API struct ggml_tensor * ggml_map_custom1(
            struct ggml_context   * ctx,
            struct ggml_tensor    * a,
@ -1994,26 +2046,44 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * tensor);

-
    GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
    GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);

    // graph allocation in a context
-    GGML_API struct ggml_cgraph * ggml_new_graph         (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
-    GGML_API struct ggml_cgraph * ggml_new_graph_custom  (struct ggml_context * ctx, size_t size, bool grads);
-    GGML_API struct ggml_cgraph * ggml_graph_dup         (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-    GGML_API struct ggml_cgraph   ggml_graph_view        (struct ggml_cgraph * cgraph, int i0, int i1);
-    GGML_API void                 ggml_graph_cpy         (struct ggml_cgraph * src, struct ggml_cgraph * dst);
-    GGML_API void                 ggml_graph_reset       (struct ggml_cgraph * cgraph);  // zero grads
-    GGML_API void                 ggml_graph_clear       (struct ggml_cgraph * cgraph);
+    GGML_API struct ggml_cgraph * ggml_new_graph       (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
+    GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
+    GGML_API struct ggml_cgraph * ggml_graph_dup       (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
+    GGML_API void                 ggml_graph_cpy       (struct ggml_cgraph * src, struct ggml_cgraph * dst);
+    GGML_API void                 ggml_graph_reset     (struct ggml_cgraph * cgraph);  // zero grads
+    GGML_API void                 ggml_graph_clear     (struct ggml_cgraph * cgraph);
+
+    GGML_API int                   ggml_graph_size   (struct ggml_cgraph * cgraph);
+    GGML_API struct ggml_tensor *  ggml_graph_node   (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
+    GGML_API struct ggml_tensor ** ggml_graph_nodes  (struct ggml_cgraph * cgraph);
+    GGML_API int                   ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
+
+    GGML_API void   ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);

    GGML_API size_t ggml_graph_overhead(void);
    GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);

+    GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
+    GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
+    GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
+    GGML_API struct ggml_threadpool *      ggml_threadpool_new          (struct ggml_threadpool_params  * params);
+    GGML_API void                          ggml_threadpool_free         (struct ggml_threadpool * threadpool);
+    GGML_API int                           ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
+    GGML_API void                          ggml_threadpool_pause        (struct ggml_threadpool * threadpool);
+    GGML_API void                          ggml_threadpool_resume       (struct ggml_threadpool * threadpool);
+
    // ggml_graph_plan() has to be called before ggml_graph_compute()
    // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan   (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-    GGML_API enum ggml_status  ggml_graph_compute(      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API struct ggml_cplan ggml_graph_plan(
+                  const struct ggml_cgraph * cgraph,
+                                       int   n_threads, /* = GGML_DEFAULT_N_THREADS */
+                    struct ggml_threadpool * threadpool /* = NULL */ );
+    GGML_API enum ggml_status  ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+
    // same as ggml_graph_compute() but the work data is allocated as a part of the context
    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
    GGML_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
@ -2402,6 +2472,7 @@ extern "C" {
    GGML_API int ggml_cpu_has_gpublas    (void);
    GGML_API int ggml_cpu_has_sse3       (void);
    GGML_API int ggml_cpu_has_ssse3      (void);
+    GGML_API int ggml_cpu_has_riscv_v    (void);
    GGML_API int ggml_cpu_has_sycl       (void);
    GGML_API int ggml_cpu_has_rpc        (void);
    GGML_API int ggml_cpu_has_vsx        (void);