move BLAS to a separate backend (#6210)

* move BLAS to a separate backend * rename GGML_USE_OPENBLAS to GGML_USE_BLAS * alloc : reuse same buffer when the same buffer type if used multiple times * set number of threads automatically for openblas and blis * sched : print assignments when GGML_SCHED_DEBUG env variable is set * sched : allow ops with weights on an incompatible buffer type This will cause the weight to be copied to a backend that supports the op, which is very costly. The weight should have been stored in a buffer of a backend that can run the op, but llama.cpp cannot do this automatically at the moment. --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-06-13 03:11:35 +02:00 · 2024-06-13 03:11:35 +02:00 · f578b86b21
commit f578b86b21
parent 1c641e6aac
17 changed files with 821 additions and 379 deletions
--- a/ggml-backend-impl.h
+++ b/ggml-backend-impl.h
@ -17,13 +17,15 @@ extern "C" {

    struct ggml_backend_buffer_type_i {
        const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
+        // allocate a buffer of this type
        ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
-        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft); // allocation max size
-        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
-        bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        // tensor alignment
+        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft);
+        // max buffer size that can be allocated
+        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft);
+        // data size needed to allocate the tensor, including padding
+        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
        // check if tensor data is in host memory
-        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
        bool                  (*GGML_CALL is_host)         (ggml_backend_buffer_type_t buft);
    };

@ -92,27 +94,37 @@ extern "C" {
        void (*GGML_CALL synchronize)(ggml_backend_t backend);

        // compute graph with a plan (not used currently)
+        // create a new plan for a graph
        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
        void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
+        void                      (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
+        // compute the graph with the plan
+        enum ggml_status          (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

-        // compute graph with a plan
-        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
        // compute graph without a plan (async)
        enum ggml_status (*GGML_CALL graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph);

-        // check if the backend supports an operation
+        // check if the backend can compute an operation
        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);

+        // check if the backend can use tensors allocated in a buffer type
+        bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
+
        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
        // these should be expensive operations with large batch sizes that may benefit from running on this backend
        // even if the weight has to be copied from the CPU temporarily
        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);

        // (optional) event synchronization
+        // create a new event that can record events on this backend instance
        ggml_backend_event_t (*GGML_CALL event_new)         (ggml_backend_t backend);
        void                 (*GGML_CALL event_free)        (ggml_backend_event_t event);
+        // record an event on the backend instance that created it
        void                 (*GGML_CALL event_record)      (ggml_backend_event_t event);
+        // wait for an event on on a different backend instance
        void                 (*GGML_CALL event_wait)        (ggml_backend_t backend, ggml_backend_event_t event);
+        // block until an event is recorded
        void                 (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
    };