move BLAS to a separate backend (#6210)

* move BLAS to a separate backend * rename GGML_USE_OPENBLAS to GGML_USE_BLAS * alloc : reuse same buffer when the same buffer type if used multiple times * set number of threads automatically for openblas and blis * sched : print assignments when GGML_SCHED_DEBUG env variable is set * sched : allow ops with weights on an incompatible buffer type This will cause the weight to be copied to a backend that supports the op, which is very costly. The weight should have been stored in a buffer of a backend that can run the op, but llama.cpp cannot do this automatically at the moment. --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-06-13 03:11:35 +02:00 · 2024-06-13 03:11:35 +02:00 · f578b86b21
commit f578b86b21
parent 1c641e6aac
17 changed files with 821 additions and 379 deletions
--- a/llama.cpp
+++ b/llama.cpp
@ -21,6 +21,10 @@
 #   include "ggml-kompute.h"
 #endif

+#ifdef GGML_USE_BLAS
+#  include "ggml-blas.h"
+#endif
+
 #ifdef GGML_USE_METAL
 #  include "ggml-metal.h"
 #endif
@ -2299,9 +2303,13 @@ struct llama_context {
    std::vector<ggml_backend_t> backends;
 #ifdef GGML_USE_METAL
    ggml_backend_t backend_metal = nullptr;
+#endif
+#ifdef GGML_USE_BLAS
+    ggml_backend_t backend_blas = nullptr;
 #endif
    ggml_backend_t backend_cpu = nullptr;

+
    const llama_model & model;

    // key + value cache for the self attention
@ -11529,7 +11537,8 @@ static struct ggml_cgraph * llama_build_graph(
        if (batch.n_tokens < 32 || full_offload) {
            if (il != -1 && strcmp(name, "norm") == 0) {
                for (auto * backend : lctx.backends) {
-                    if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
+                    if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) &&
+                        (ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) {
                        ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
                        break;
                    }
@ -12026,6 +12035,11 @@ static void llama_graph_compute(
        ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
        ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
    }
+#ifdef GGML_USE_BLAS
+    if (lctx.backend_blas != nullptr) {
+        ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads);
+    }
+#endif

    ggml_backend_sched_graph_compute_async(lctx.sched, gf);

@ -12248,17 +12262,6 @@ static int llama_decode_internal(
        }
        // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

-        // for big prompts, if BLAS is enabled, it is better to use only one thread
-        // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-        // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
-        //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
-        //       with the BLAS calls. need a better solution
-        // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
-        //                   being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
-        if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
-            n_threads = std::min(4, n_threads);
-        }
-
        ggml_backend_sched_alloc_graph(lctx.sched, gf);

        llama_set_inputs(lctx, u_batch);
@ -16251,6 +16254,16 @@ struct llama_context * llama_new_context_with_model(
            ctx->backends.push_back(backend);
        }
 #endif
+
+#ifdef GGML_USE_BLAS
+        ctx->backend_blas = ggml_backend_blas_init();
+        if (ctx->backend_blas == nullptr) {
+            LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
+        } else {
+            ctx->backends.push_back(ctx->backend_blas);
+        }
+#endif
+
 #if defined(GGML_USE_RPC)
        if (model->n_gpu_layers > 0) {
            for (const auto & endpoint : model->rpc_servers) {