diff --git a/CMakeLists.txt b/CMakeLists.txt
index d8f7780f7..5576c26e1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,8 +39,12 @@ endif()
 
 if (APPLE)
     set(LLAMA_METAL_DEFAULT ON)
+    set(LLAMA_BLAS_DEFAULT ON)
+    set(LLAMA_BLAS_VENDOR_DEFAULT "Apple")
 else()
     set(LLAMA_METAL_DEFAULT OFF)
+    set(LLAMA_BLAS_DEFAULT OFF)
+    set(LLAMA_BLAS_VENDOR_DEFAULT "Generic")
 endif()
 
 set(LLAMA_LLAMAFILE_DEFAULT ON)
@@ -91,8 +95,9 @@ endif()
 
 # 3rd party libs
 option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
-option(LLAMA_BLAS "llama: use BLAS" OFF)
-set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
+option(LLAMA_BLAS "llama: use BLAS" ${LLAMA_BLAS_DEFAULT})
+set(LLAMA_BLAS_VENDOR ${LLAMA_BLAS_VENDOR_DEFAULT} CACHE STRING
+    "llama: BLAS library vendor")
 option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT})
 option(LLAMA_CUDA "llama: use CUDA" OFF)
 option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
@@ -321,7 +326,7 @@ if (LLAMA_BLAS)
     if (BLAS_FOUND)
         message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
 
-        if ("${BLAS_INCLUDE_DIRS}" STREQUAL "")
+        if (("${BLAS_INCLUDE_DIRS}" STREQUAL "") AND NOT (${LLAMA_BLAS_VENDOR} MATCHES "Apple"))
             # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
             # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
             find_package(PkgConfig REQUIRED)
diff --git a/Makefile b/Makefile
index e57fcd9ad..adc9fa434 100644
--- a/Makefile
+++ b/Makefile
@@ -404,7 +404,7 @@ ifndef LLAMA_NO_ACCELERATE
     # Mac OS - include Accelerate framework.
     # `-framework Accelerate` works both with Apple Silicon and Mac Intel
     ifeq ($(UNAME_S),Darwin)
-        MK_CPPFLAGS += -DGGML_USE_ACCELERATE
+        MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS
         MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
         MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
         MK_LDFLAGS += -framework Accelerate
diff --git a/ggml-alloc.c b/ggml-alloc.c
index 8973ef813..0048e5c92 100644
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -706,7 +706,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
-        //node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
         if (node->view_src || node->data) {
             node_alloc->dst.buffer_id = -1;
             node_alloc->dst.offset = SIZE_MAX;
diff --git a/ggml-blas.cpp b/ggml-blas.cpp
index 2537a4a0f..608ead190 100644
--- a/ggml-blas.cpp
+++ b/ggml-blas.cpp
@@ -16,6 +16,7 @@ struct ggml_backend_blas_context {
     int n_threads;
     char * work_data;
     size_t work_size;
+    std::vector<std::future<void>> tasks;
 };
 
 // helper function to determine if it is better to use BLAS or not
@@ -33,7 +34,7 @@ static bool ggml_backend_blas_use_blas(const struct ggml_tensor * dst) {
     if (ggml_is_contiguous(src0) &&
         ggml_is_contiguous(src1) &&
         src1->type == GGML_TYPE_F32 &&
-        ((src0->type == GGML_TYPE_F32) || (ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) {
+        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
 
         /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
         return true;
@@ -83,7 +84,6 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
 
     // convert src0 to float
    if (type != GGML_TYPE_F32) {
-        std::vector<std::future<void>> tasks;
         ggml_to_float_t const to_float = type_traits.to_float;
 
         for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -98,7 +98,7 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
             }
 #else
             for (int i = 0; i < ctx->n_threads; i++) {
-                tasks.push_back(std::async(std::launch::async, [=]() {
+                ctx->tasks.push_back(std::async(std::launch::async, [=]() {
                     const int64_t start = i*ne01/ctx->n_threads;
                     const int64_t end = (i + 1)*ne01/ctx->n_threads;
                     for (int64_t i01 = start; i01 < end; i01++) {
@@ -109,10 +109,14 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
 #endif
             }
         }
+
+#ifndef GGML_USE_OPENMP
         // wait for all tasks to finish
-        for (auto & task : tasks) {
+        for (auto & task : ctx->tasks) {
             task.get();
         }
+        ctx->tasks.clear();
+#endif
     }
 
     for (int64_t i13 = 0; i13 < ne13; i13++) {
diff --git a/ggml.c b/ggml.c
index e4ef34f25..01589c10e 100644
--- a/ggml.c
+++ b/ggml.c
@@ -22645,7 +22645,7 @@ int ggml_cpu_has_wasm_simd(void) {
 }
 
 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
     return 1;
 #else
     return 0;
diff --git a/llama.cpp b/llama.cpp
index 57d007f33..ec087a0a3 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -21,7 +21,7 @@
 # include "ggml-kompute.h"
 #endif
 
-#if defined(GGML_USE_BLAS) || defined(GGML_USE_ACCELERATE)
+#ifdef GGML_USE_BLAS
 # include "ggml-blas.h"
 #endif
 
@@ -2303,7 +2303,7 @@ struct llama_context {
 #ifdef GGML_USE_METAL
     ggml_backend_t backend_metal = nullptr;
 #endif
-#if defined(GGML_USE_BLAS) || defined(GGML_USE_ACCELERATE)
+#ifdef GGML_USE_BLAS
     ggml_backend_t backend_blas = nullptr;
 #endif
     ggml_backend_t backend_cpu = nullptr;
@@ -12025,7 +12025,7 @@ static void llama_graph_compute(
         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
         ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
     }
-#if defined(GGML_USE_BLAS) || defined(GGML_USE_ACCELERATE)
+#ifdef GGML_USE_BLAS
     if (lctx.backend_blas != nullptr) {
         ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads);
     }
@@ -16240,7 +16240,7 @@ struct llama_context * llama_new_context_with_model(
     }
 #endif
 
-#if defined(GGML_USE_BLAS) || defined(GGML_USE_ACCELERATE)
+#ifdef GGML_USE_BLAS
     ctx->backend_blas = ggml_backend_blas_init();
     if (ctx->backend_blas == nullptr) {
         LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);