move BLAS to a separate backend

slaren 2024-03-20 18:45:05 +01:00
parent b90dc566c1
commit 7f58793c56
16 changed files with 590 additions and 339 deletions

CMakeLists.txt

@@ -311,9 +311,9 @@ if (LLAMA_BLAS)
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)
     endif()
-    if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
-        set(BLA_SIZEOF_INTEGER 8)
-    endif()
+    #if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
+    #    set(BLA_SIZEOF_INTEGER 8)
+    #endif()
     set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
     find_package(BLAS)

@@ -380,6 +380,9 @@ if (LLAMA_BLAS)
         add_compile_definitions(GGML_BLAS_USE_MKL)
     endif()

+    set(GGML_HEADERS_BLAS ggml-blas.h)
+    set(GGML_SOURCES_BLAS ggml-blas.c)
+
     set(LLAMA_EXTRA_LIBS     ${LLAMA_EXTRA_LIBS}     ${BLAS_LIBRARIES})
     set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
 else()

@@ -1255,6 +1258,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_KOMPUTE}   ${GGML_HEADERS_KOMPUTE}
             ${GGML_SOURCES_VULKAN}    ${GGML_HEADERS_VULKAN}
             ${GGML_SOURCES_ROCM}      ${GGML_HEADERS_ROCM}
+            ${GGML_SOURCES_BLAS}      ${GGML_HEADERS_BLAS}
             ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
             )

Makefile

@@ -408,6 +408,7 @@ ifndef LLAMA_NO_ACCELERATE
         MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
         MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
         MK_LDFLAGS  += -framework Accelerate
+        OBJS        += ggml-blas.o
     endif
 endif # LLAMA_NO_ACCELERATE

@@ -421,23 +422,35 @@ ifdef LLAMA_OPENBLAS
     MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
     MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
     MK_LDFLAGS  += $(shell pkg-config --libs openblas)
+    OBJS        += ggml-blas.o
 endif # LLAMA_OPENBLAS

+ifdef LLAMA_OPENBLAS64
+    MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas64)
+    MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas64)
+    MK_LDFLAGS  += $(shell pkg-config --libs openblas64)
+    OBJS        += ggml-blas.o
+endif # LLAMA_OPENBLAS64
+
+ifdef LLAMA_BLIS
+    MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
+    MK_LDFLAGS  += -lblis -L/usr/local/lib
+    OBJS        += ggml-blas.o
+endif # LLAMA_BLIS
+
 ifndef LLAMA_NO_LLAMAFILE
     MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
     OBJS        += sgemm.o
 endif

-ifdef LLAMA_BLIS
-    MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
-    MK_LDFLAGS  += -lblis -L/usr/local/lib
-endif # LLAMA_BLIS
-
 ifdef LLAMA_RPC
     MK_CPPFLAGS += -DGGML_USE_RPC
     OBJS        += ggml-rpc.o
 endif # LLAMA_RPC

+ggml-blas.o: ggml-blas.c ggml-blas.h
+	$(CC) $(CFLAGS) -c $< -o $@
+
 ifdef LLAMA_CUBLAS
 # LLAMA_CUBLAS is deprecated and will be removed in the future
 LLAMA_CUDA := 1

ggml-alloc.c

@@ -339,6 +339,7 @@ struct hash_node {
 };

 struct tensor_alloc {
+    int buffer_id;
     size_t offset;
     size_t size_max; // 0 = pre-allocated, unused, or view
 };

@@ -349,7 +350,6 @@ struct leaf_alloc {
 };

 struct node_alloc {
-    int buffer_id;
     struct tensor_alloc dst;
     struct tensor_alloc src[GGML_MAX_SRC];
 };

@@ -511,17 +511,18 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
     }
 }

-static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
     // graph outputs are never freed
     if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
         AT_PRINTF("not freeing output %s\n", node->name);
         return;
     }

-    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
-    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
     size_t offset = hn->offset;
+    int buffer_id = hn->buffer_id;
+    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
+    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     size_t size = ggml_backend_buft_get_alloc_size(buft, node);
     ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
     hn->allocated = false;

@@ -626,11 +627,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
                         AT_PRINTF("view_src %s: %d children, %d views\n",
                             view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                         if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
-                            ggml_gallocr_free_node(galloc, view_src, buffer_id);
+                            ggml_gallocr_free_node(galloc, view_src);
                         }
                     }
                     else if (p_hn->allocated) {
-                        ggml_gallocr_free_node(galloc, parent, buffer_id);
+                        ggml_gallocr_free_node(galloc, parent);
                     }
                 }
                 AT_PRINTF("\n");
@@ -674,22 +675,26 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
-        node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
+        //node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
         if (node->view_src || node->data) {
+            node_alloc->dst.buffer_id = -1;
             node_alloc->dst.offset = SIZE_MAX;
             node_alloc->dst.size_max = 0;
         } else {
             struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
-            node_alloc->dst.offset = hn->offset;
-            node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
+            node_alloc->dst.buffer_id = hn->buffer_id;
+            node_alloc->dst.offset    = hn->offset;
+            node_alloc->dst.size_max  = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (!src || src->view_src || src->data) {
+                node_alloc->src[j].buffer_id = -1;
                 node_alloc->src[j].offset = SIZE_MAX;
                 node_alloc->src[j].size_max = 0;
             } else {
                 struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
+                node_alloc->src[j].buffer_id = hn->buffer_id;
                 node_alloc->src[j].offset = hn->offset;
                 node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
             }

@@ -706,9 +711,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
         galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
         if (leaf->view_src || leaf->data) {
+            galloc->leaf_allocs[i].leaf.buffer_id = -1;
             galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
             galloc->leaf_allocs[i].leaf.size_max = 0;
         } else {
+            galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
             galloc->leaf_allocs[i].leaf.offset = hn->offset;
             galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
         }

@@ -740,7 +747,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }

-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
+    int buffer_id = tensor_alloc->buffer_id;
     assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);

     if (tensor->view_src != NULL) {

@@ -768,8 +776,8 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
     }
 }

-static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
-    ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
+static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
+    ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
     size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
     return talloc->size_max >= node_size;
 }

@@ -793,7 +801,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];

-        if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
+        if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
 #ifndef NDEBUG
             fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
 #endif

@@ -805,7 +813,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
             if (src == NULL) {
                 continue;
             }

-            if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
+            if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
 #ifndef NDEBUG
                 fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
 #endif

@@ -846,7 +854,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
-        ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
+        ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
     }

     // nodes
     for (int i = 0; i < graph->n_nodes; i++) {

@@ -857,9 +865,9 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
             if (src == NULL) {
                 continue;
             }
-            ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
+            ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
         }
-        ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
+        ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
     }

     return true;
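
Net effect of the allocator changes above: the buffer assignment moves from node_alloc into tensor_alloc, so every dst/src slot carries its own buffer_id (-1 for pre-allocated or view tensors). Assembled from the hunks, the resulting structs are:

    struct tensor_alloc {
        int buffer_id;
        size_t offset;
        size_t size_max; // 0 = pre-allocated, unused, or view
    };

    struct node_alloc {
        struct tensor_alloc dst;
        struct tensor_alloc src[GGML_MAX_SRC];
    };

This is what lets ggml_gallocr_init_tensor and ggml_gallocr_node_needs_realloc drop their buffer_id/node_alloc parameters.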

ggml-backend-impl.h

@@ -17,13 +17,15 @@ extern "C" {
     struct ggml_backend_buffer_type_i {
         const char *          (*GGML_CALL get_name)      (ggml_backend_buffer_type_t buft);
+        // allocate a buffer of this type
        ggml_backend_buffer_t (*GGML_CALL alloc_buffer)   (ggml_backend_buffer_type_t buft, size_t size);
-        size_t                (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t                (*GGML_CALL get_max_size)  (ggml_backend_buffer_type_t buft); // allocation max size
-        size_t                (*GGML_CALL get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
-        bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        // tensor alignment
+        size_t                (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft);
+        // max buffer size that can be allocated
+        size_t                (*GGML_CALL get_max_size)  (ggml_backend_buffer_type_t buft);
+        // data size needed to allocate the tensor, including padding
+        size_t                (*GGML_CALL get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
         // check if tensor data is in host memory
-        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
         bool                  (*GGML_CALL is_host)       (ggml_backend_buffer_type_t buft);
     };

@@ -92,27 +94,37 @@ extern "C" {
         void (*GGML_CALL synchronize)(ggml_backend_t backend);

         // compute graph with a plan (not used currently)
+        // create a new plan for a graph
         ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
         void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
+        void                      (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
+        // compute the graph with the plan
+        enum ggml_status          (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

-        // compute graph with a plan
-        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
         // compute graph without a plan (async)
         enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);

-        // check if the backend supports an operation
+        // check if the backend can compute an operation
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);

+        // check if the backend can use tensors allocated in a buffer type
+        bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
+
         // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
         // these should be expensive operations with large batch sizes that may benefit from running on this backend
         // even if the weight has to be copied from the CPU temporarily
         bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);

         // (optional) event synchronization
+        // create a new event that can record events on this backend instance
         ggml_backend_event_t (*GGML_CALL event_new)         (ggml_backend_t backend);
         void                 (*GGML_CALL event_free)        (ggml_backend_event_t event);
+        // record an event on the backend instance that created it
         void                 (*GGML_CALL event_record)      (ggml_backend_event_t event);
+        // wait for an event on on a different backend instance
         void                 (*GGML_CALL event_wait)        (ggml_backend_t backend, ggml_backend_event_t event);
+        // block until an event is recorded
         void                 (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
     };

ggml-backend.c

@@ -44,10 +44,6 @@ GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buf
     return ggml_nbytes(tensor);
 }

-bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return buft->iface.supports_backend(buft, backend);
-}
-
 bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
     if (buft->iface.is_host) {
         return buft->iface.is_host(buft);

@@ -286,6 +282,10 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
     return backend->iface.supports_op(backend, op);
 }

+bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    return backend->iface.supports_buft(backend, buft);
+}
+
 bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     if (backend->iface.offload_op != NULL) {
         return backend->iface.offload_op(backend, op);

@@ -639,12 +639,6 @@ GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_
     GGML_UNUSED(buft);
 }

-GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_cpu(backend);
-
-    GGML_UNUSED(buft);
-}
-
 GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;

@@ -659,7 +653,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
             /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
         },
         /* .context = */ NULL,

@@ -715,7 +708,6 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
             /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
         },
         /* .context = */ NULL,

@@ -836,6 +828,12 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
     GGML_UNUSED(backend);
 }

+GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    return ggml_backend_buft_is_host(buft);
+
+    GGML_UNUSED(backend);
+}
+
 static struct ggml_backend_i cpu_backend_i = {
     /* .get_name                = */ ggml_backend_cpu_name,
     /* .free                    = */ ggml_backend_cpu_free,

@@ -846,9 +844,11 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .synchronize             = */ NULL,
     /* .graph_plan_create       = */ ggml_backend_cpu_graph_plan_create,
     /* .graph_plan_free         = */ ggml_backend_cpu_graph_plan_free,
+    /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
     /* .supports_op             = */ ggml_backend_cpu_supports_op,
+    /* .supports_buft           = */ ggml_backend_cpu_supports_buft,
     /* .offload_op              = */ NULL,
     /* .event_new               = */ NULL,
     /* .event_free              = */ NULL,
@@ -1055,6 +1055,9 @@ struct ggml_backend_sched {
     int * node_backend_ids; // [graph_size]
     int * leaf_backend_ids; // [graph_size]

+    int * prev_node_backend_ids; // [graph_size]
+    int * prev_leaf_backend_ids; // [graph_size]
+
     // copy of the graph with modified inputs
     struct ggml_cgraph * graph;
@@ -1097,15 +1100,16 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
     return -1;
 }

-static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
+static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
     ggml_backend_buffer_t buffer = tensor->buffer;
     if (buffer == NULL) {
         return -1;
     }

-    // find highest prio backend that supports the buffer type
+    // find highest prio backend that supports the buffer type and the op
     for (int i = 0; i < sched->n_backends; i++) {
-        if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
+        if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
+            ggml_backend_supports_op(sched->backends[i], op)) {
             return i;
         }
     }

@@ -1126,12 +1130,17 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED
 #define GET_CAUSE(node) ""
 #endif

+//#define DEBUG_PASS1
+//#define DEBUG_PASS2
+//#define DEBUG_PASS3
+//#define DEBUG_PASS4
+
 // returns the backend that should be used for the node based on the current locations
 static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
     // TODO: use supports_op to check if the backend supports the op

     // assign pre-allocated nodes to their backend
-    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
+    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
     if (cur_backend_id != -1) {
         SET_CAUSE(tensor, "1.dst");
         return cur_backend_id;

@@ -1139,7 +1148,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     // view_src
     if (tensor->view_src != NULL) {
-        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
+        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
         if (cur_backend_id != -1) {
             SET_CAUSE(tensor, "1.vsrc");
             return cur_backend_id;

@@ -1161,7 +1170,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
             continue;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
+            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
             // check if a backend with higher prio wants to offload the op
             if (src_backend_id == sched->n_backends - 1) {
                 for (int b = 0; b < src_backend_id; b++) {
@@ -1223,10 +1232,43 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
     }
 }

-//#define DEBUG_PASS1
-//#define DEBUG_PASS2
-//#define DEBUG_PASS3
-//#define DEBUG_PASS4
+static int set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
+    if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
+        *node_backend_id = cur_backend_id;
+        SET_CAUSE(node, "2.2");
+    } else {
+        for (int b = 0; b < sched->n_backends; b++) {
+            if (b == cur_backend_id) {
+                continue;
+            }
+            if (ggml_backend_supports_op(sched->backends[b], node)) {
+                *node_backend_id = b;
+                cur_backend_id = b;
+                SET_CAUSE(node, "2.2");
+                break;
+            }
+        }
+    }
+    return cur_backend_id;
+}
+
+static bool buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int cur_backend_id) {
+    ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
+    ggml_backend_buffer_type_t buft = NULL;
+
+    if (buf) {
+        // the tensor is already allocated
+        buft = buf->buft;
+    } else {
+        // see if the tensor already has a backend assigned, and use the buffer type of that backend
+        int tensor_backend_id = tensor_backend_id(t);
+        if (tensor_backend_id != -1) {
+            buft = sched->bufts[tensor_backend_id];
+        }
+    }
+
+    return buft != NULL && ggml_backend_supports_buft(sched->backends[cur_backend_id], buft);
+}

 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
 static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
@@ -1306,9 +1348,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 } else {
                     cur_backend_id = *node_backend_id;
                 }
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.2");
+            } else if (cur_backend_id != -1) {
+                // FIXME: clean this
+                cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
+                if (cur_backend_id == sched->n_backends - 1) {
+                    // skip cpu (lowest prio backend)
+                    cur_backend_id = -1;
+                }
             }
         }
     }

@@ -1328,9 +1374,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 } else {
                     cur_backend_id = *node_backend_id;
                 }
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.1");
+            } else if (cur_backend_id != -1) {
+                cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
+                if (cur_backend_id == sched->n_backends - 1) {
+                    // skip cpu (lowest prio backend)
+                    cur_backend_id = -1;
+                }
             }
         }
     }

@@ -1345,9 +1394,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             int * node_backend_id = &tensor_backend_id(node);
             if (*node_backend_id != -1) {
                 cur_backend_id = *node_backend_id;
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.4");
+            } else if (cur_backend_id != -1) {
+                cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }

@@ -1362,9 +1410,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             int * node_backend_id = &tensor_backend_id(node);
             if (*node_backend_id != -1) {
                 cur_backend_id = *node_backend_id;
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.3");
+            } else if (cur_backend_id != -1) {
+                cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
@@ -1448,10 +1495,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
             }
             // check if the split has too many inputs
+            // FIXME: count the number of inputs instead of only checking when full
             if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
                 const size_t id = hash_id(src);
                 int src_backend_id = sched->tensor_backend_id[id];
-                if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
+                bool supported = buffer_supported(sched, src, cur_backend_id);
+                if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) {
                     //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
                     need_new_split = true;
                     break;

@@ -1486,7 +1535,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             const int src_backend_id = tensor_backend_id(src);
             assert(src_backend_id != -1); // all inputs should be assigned by now
             if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
                 size_t id = hash_id(src);
                 if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
                     ggml_backend_t backend = sched->backends[src_backend_id];

@@ -1511,7 +1560,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
             }

-            if (src_backend_id != node_backend_id) {
+            bool supported = buffer_supported(sched, src, cur_backend_id);
+            if (src_backend_id != cur_backend_id && !supported) {
                 // create a copy of the input in the split's backend
                 const size_t id = hash_id(src);
                 if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
@@ -1543,6 +1593,18 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // create copies of the graph for each split
     // TODO: avoid this copy

+    // swap node_backend_ids and leaf_backend_ids and prevs
+    {
+        int * tmp = sched->node_backend_ids;
+        sched->node_backend_ids = sched->prev_node_backend_ids;
+        sched->prev_node_backend_ids = tmp;
+
+        tmp = sched->leaf_backend_ids;
+        sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
+        sched->prev_leaf_backend_ids = tmp;
+    }
+
     struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &sched->splits[i];

@@ -1613,8 +1675,24 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     }
 }

 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
+    bool backend_ids_changed = false;
+    for (int i = 0; i < sched->graph->n_nodes; i++) {
+        if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i]) {
+            backend_ids_changed = true;
+            break;
+        }
+    }
+    if (!backend_ids_changed) {
+        for (int i = 0; i < sched->graph->n_leafs; i++) {
+            if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i]) {
+                backend_ids_changed = true;
+                break;
+            }
+        }
+    }
+
     // allocate graph
-    if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+    if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
         // the re-allocation may cause the split inputs to be moved to a different address
         ggml_backend_sched_synchronize(sched);
 #ifndef NDEBUG

@@ -1735,6 +1813,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
     const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
     sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
     sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
+    sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
+    sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));

     sched->n_backends = n_backends;

@@ -1747,7 +1827,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
     for (int b = 0; b < n_backends; b++) {
         sched->backends[b] = backends[b];
         sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
-        GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
+        GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));

         if (sched->n_copies > 1) {
             for (int c = 0; c < sched->n_copies; c++) {
                 sched->events[b][c] = ggml_backend_event_new(backends[b]);
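
Taken together, the scheduler changes replace the buffer-type-centric supports_backend check with the backend-centric supports_buft, which lets a backend such as the new BLAS one share host buffers with the CPU backend instead of forcing tensor copies. A minimal sketch of wiring both into the scheduler; the ggml_backend_sched_new signature is assumed from the surrounding code rather than shown in this diff:

    // sketch: schedule graphs across the new BLAS backend and the CPU backend
    ggml_backend_t backends[2] = {
        ggml_backend_blas_init(), // added by this commit (see ggml-blas.c below)
        ggml_backend_cpu_init(),  // last entry = lowest priority fallback
    };
    ggml_backend_sched_t sched = ggml_backend_sched_new(
        backends, /*bufts=*/NULL, /*n_backends=*/2,
        /*graph_size=*/GGML_DEFAULT_GRAPH_SIZE, /*parallel=*/false);
    GGML_ASSERT(sched != NULL);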

ggml-backend.h

@@ -23,7 +23,6 @@ extern "C" {
     GGML_API           size_t ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
     GGML_API           size_t ggml_backend_buft_get_max_size    (ggml_backend_buffer_type_t buft);
     GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
-    GGML_API           bool   ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
     GGML_API           bool   ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);

     // buffer

@@ -74,6 +73,7 @@ extern "C" {
     GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
+    GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
     GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);

     // tensor copy between different backends

@@ -90,7 +90,7 @@ extern "C" {
     GGML_API void ggml_backend_event_free       (ggml_backend_event_t event);
     GGML_API void ggml_backend_event_record     (ggml_backend_event_t event);
     GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
-    GGML_API void ggml_backend_event_wait       (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
+    GGML_API void ggml_backend_event_wait       (ggml_backend_t backend, ggml_backend_event_t event);

     //
     // CPU backend

@@ -119,7 +119,7 @@ extern "C" {
     GGML_API size_t                     ggml_backend_reg_get_count(void);
     GGML_API size_t                     ggml_backend_reg_find_by_name(const char * name);
-    GGML_API ggml_backend_t             ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
+    GGML_API ggml_backend_t             ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
     GGML_API const char *               ggml_backend_reg_get_name(size_t i);
     GGML_API ggml_backend_t             ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
     GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);

ggml-blas.c (new file, 257 lines)

@@ -0,0 +1,257 @@
#include "ggml-blas.h"
#include "ggml-backend-impl.h"
#include <stdlib.h>
#if defined(GGML_USE_ACCELERATE)
# include <Accelerate/Accelerate.h>
#elif defined(GGML_USE_OPENBLAS)
# if defined(GGML_BLAS_USE_MKL)
# include <mkl.h>
# else
# include <cblas.h>
# endif
#endif
struct ggml_backend_blas_context {
int n_threads;
void * work_data;
size_t work_size;
};
// helper function to determine if it is better to use BLAS or not
// for large matrices, BLAS is faster
static bool ggml_compute_forward_mul_mat_use_blas(const struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
//const int64_t ne00 = src0->ne[0];
//const int64_t ne01 = src0->ne[1];
const int64_t ne10 = src1->ne[0];
const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];
// TODO: find the optimal values for these
if (ggml_is_contiguous(src0) &&
ggml_is_contiguous(src1) &&
src1->type == GGML_TYPE_F32 &&
((src0->type == GGML_TYPE_F32) || (ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) {
/*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
return true;
}
return false;
}
static void ggml_backend_blas_mul_mat(struct ggml_backend_blas_context * ctx, struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_TENSOR_BINARY_OP_LOCALS
const enum ggml_type type = src0->type;
ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type);
GGML_ASSERT(ne0 == ne01);
GGML_ASSERT(ne1 == ne11);
GGML_ASSERT(ne2 == ne12);
GGML_ASSERT(ne3 == ne13);
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == ggml_type_size(type));
GGML_ASSERT(nb10 == ggml_type_size(src1->type));
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(nb0 <= nb1);
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
// broadcast factors
const int64_t r2 = ne12/ne02;
const int64_t r3 = ne13/ne03;
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
const int64_t ne_plane = ne01*ne00;
const size_t desired_wsize = type == GGML_TYPE_F32 ? 0 : ne13*ne12*ne_plane*sizeof(float);
if (ctx->work_size < desired_wsize) {
free(ctx->work_data);
ctx->work_data = malloc(desired_wsize);
GGML_ASSERT(ctx->work_data != NULL);
ctx->work_size = desired_wsize;
}
void * wdata = ctx->work_data;
// convert src0 to float
if (true) {
if (type != GGML_TYPE_F32) {
ggml_to_float_t const to_float = type_traits.to_float;
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
float * const wplane = (float *) wdata + i03*ne12*ne_plane + i02*ne_plane;
#ifdef GGML_USE_OPENMP
#pragma omp parallel for num_threads(ctx->n_threads)
#endif
for (int64_t i01 = 0; i01 < ne01; i01++) {
to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
}
}
}
}
}
for (int64_t i13 = 0; i13 < ne13; i13++) {
for (int64_t i12 = 0; i12 < ne12; i12++) {
const int64_t i03 = i13/r3;
const int64_t i02 = i12/r2;
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
if (type != GGML_TYPE_F32) {
x = (float *) wdata + i03*ne12*ne_plane + i02*ne_plane;
}
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
ne1, ne01, ne10,
1.0f, y, ne10,
x, ne00,
0.0f, d, ne01);
}
}
}
// backend interface
GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) {
return "BLAS";
GGML_UNUSED(backend);
}
GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) {
free(backend);
}
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
return ggml_backend_cpu_buffer_type();
GGML_UNUSED(backend);
}
GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_backend_blas_context * ctx = (struct ggml_backend_blas_context *)backend->context;
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i];
switch (node->op) {
case GGML_OP_MUL_MAT:
ggml_backend_blas_mul_mat(ctx, node);
break;
// TODO
//case GGML_OP_OUT_PROD:
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
break;
default:
fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node));
GGML_ASSERT(false);
}
}
return GGML_STATUS_SUCCESS;
GGML_UNUSED(backend);
}
GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
return op->op == GGML_OP_MUL_MAT && ggml_compute_forward_mul_mat_use_blas(op);
GGML_UNUSED(backend);
}
GGML_CALL static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
return ggml_backend_buft_is_host(buft);
GGML_UNUSED(backend);
}
static struct ggml_backend_i blas_backend_i = {
/* .get_name = */ ggml_backend_blas_name,
/* .free = */ ggml_backend_blas_free,
/* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_blas_graph_compute,
/* .supports_op = */ ggml_backend_blas_supports_op,
/* .supports_buft = */ ggml_backend_blas_supports_buft,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
/* .event_synchronize = */ NULL,
};
static ggml_guid_t ggml_backend_blas_guid(void) {
static ggml_guid guid = { 0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97, 0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d };
return &guid;
}
ggml_backend_t ggml_backend_blas_init(void) {
ggml_backend_t backend = malloc(sizeof(struct ggml_backend));
if (backend == NULL) {
return NULL;
}
struct ggml_backend_blas_context * ctx = malloc(sizeof(struct ggml_backend_blas_context));
if (ctx == NULL) {
return NULL;
}
ctx->n_threads = GGML_DEFAULT_N_THREADS;
ctx->work_data = NULL;
ctx->work_size = 0;
*backend = (struct ggml_backend) {
/* .guid = */ ggml_backend_blas_guid(),
/* .interface = */ blas_backend_i,
/* .context = */ ctx,
};
return backend;
}
GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend) {
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid());
}
void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads) {
GGML_ASSERT(ggml_backend_is_blas(backend_blas));
struct ggml_backend_blas_context * ctx = (struct ggml_backend_blas_context *)backend_blas->context;
ctx->n_threads = n_threads;
}
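
The cblas_sgemm call in ggml_backend_blas_mul_mat realizes ggml's mul_mat convention one 2D plane at a time: with row-major Y of shape ne1 x ne10 (the src1 plane) and X of shape ne01 x ne00 (the src0 plane, converted to float first when quantized; ne00 == ne10), it computes

    D = 1.0f * Y * X^T + 0.0f * D,    D of shape ne1 x ne01

so rows of dst index rows of src1 and columns of dst index rows of src0, consistent with the GGML_ASSERT(ne0 == ne01) and GGML_ASSERT(ne1 == ne11) checks at the top of the function.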

ggml-blas.h (new file, 22 lines)

@@ -0,0 +1,22 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void);
GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend);
// number of threads used for conversion to float
GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
#ifdef __cplusplus
}
#endif
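
The three declarations above are the backend's entire public surface. A minimal sketch of driving it directly; ggml_backend_free and ggml_backend_graph_compute are the standard ggml-backend helpers and their use here is an assumption, as they are not part of this diff:

    #include "ggml-blas.h"
    #include "ggml-backend.h"

    // sketch: run a prepared compute graph on the BLAS backend
    static void run_on_blas(struct ggml_cgraph * graph) {
        ggml_backend_t backend = ggml_backend_blas_init();
        if (backend == NULL) {
            return;
        }
        GGML_ASSERT(ggml_backend_is_blas(backend));
        // n_threads only affects the src0 -> float conversion loop
        ggml_backend_blas_set_n_threads(backend, 4);
        ggml_backend_graph_compute(backend, graph);
        ggml_backend_free(backend);
    }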

ggml-cuda.cu

@@ -543,6 +543,10 @@ GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_bu
     return ctx->name.c_str();
 }

+static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_cuda_buffer_type_name;
+}
+
 GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;

@@ -585,24 +589,12 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backen
     GGML_UNUSED(buft);
 }

-GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_cuda(backend)) {
-        return false;
-    }
-
-    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    return buft_ctx->device == cuda_ctx->device;
-}
-
 static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
     /* .get_name         = */ ggml_backend_cuda_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
     /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
     /* .is_host          = */ NULL,
 };

@@ -929,6 +921,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_back
     GGML_UNUSED(buft);
 }

+static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_name;
+}
+
 GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
     // instead, we allocate them for each tensor separately in init_tensor

@@ -972,12 +968,6 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_
     return total_size;
 }

-GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_cuda(backend);
-
-    GGML_UNUSED(buft);
-}
-
 GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return false;

@@ -990,7 +980,6 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
     /* .get_alignment    = */ ggml_backend_cuda_split_buffer_type_get_alignment,
     /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size   = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
     /* .is_host          = */ ggml_backend_cuda_split_buffer_type_is_host,
 };

@@ -1090,7 +1079,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
             /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
             /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
         },
         /* .context = */ nullptr,
@@ -2919,6 +2907,20 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
     GGML_UNUSED(backend);
 }

+GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    if (ggml_backend_buft_is_cuda_split(buft)) {
+        return true;
+    }
+
+    if (ggml_backend_buft_is_cuda(buft)) {
+        ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+        ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
+        return buft_ctx->device == cuda_ctx->device;
+    }
+
+    return false;
+}
+
 GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
     const int min_batch_size = 32;

@@ -2991,9 +2993,11 @@ static ggml_backend_i ggml_backend_cuda_interface = {
     /* .synchronize             = */ ggml_backend_cuda_synchronize,
     /* .graph_plan_create       = */ NULL,
     /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
     /* .supports_op             = */ ggml_backend_cuda_supports_op,
+    /* .supports_buft           = */ ggml_backend_cuda_supports_buft,
     /* .offload_op              = */ ggml_backend_cuda_offload_op,
     /* .event_new               = */ ggml_backend_cuda_event_new,
     /* .event_free              = */ ggml_backend_cuda_event_free,

ggml-kompute.cpp

@@ -1899,18 +1899,12 @@ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_
     return ctx->max_alloc;
 }

-static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    GGML_UNUSED(buft);
-    return ggml_backend_is_kompute(backend);
-}
-
 static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
     /* .get_name = */ ggml_backend_kompute_buffer_type_get_name,
     /* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment,
     /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
     /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
-    /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
     /* .is_host = */ NULL,
 };
@@ -1970,6 +1964,11 @@ static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struc
     return ggml_vk_supports_op(op);
 }

+static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(backend);
+    return buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name;
+}
+
 static struct ggml_backend_i kompute_backend_i = {
     /* .get_name = */ ggml_backend_kompute_name,
     /* .free = */ ggml_backend_kompute_free,
@@ -1980,9 +1979,11 @@ static struct ggml_backend_i kompute_backend_i = {
     /* .synchronize = */ NULL,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_kompute_graph_compute,
     /* .supports_op = */ ggml_backend_kompute_supports_op,
+    /* .supports_buft = */ ggml_backend_kompute_supports_buft,
     /* .offload_op = */ NULL,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
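Kompute above, and Metal, SYCL, and Vulkan below, all recognize their own buffer type the same way: by comparing the buffer type's get_name function pointer instead of calling it and comparing strings. Every buffer type carries a distinct get_name function, so pointer equality is a cheap, allocation-free identity test. The idiom in isolation (a sketch; buft_is_mine is not a helper from this commit):

#include <stdbool.h>
#include "ggml-backend-impl.h"

// A buffer type is "ours" iff its vtable carries our get_name function.
static bool buft_is_mine(ggml_backend_buffer_type_t buft,
                         const char * (*my_get_name)(ggml_backend_buffer_type_t)) {
    return buft->iface.get_name == my_get_name;
}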


@@ -3040,12 +3040,6 @@ GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend
     UNUSED(buft);
 }

-GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
-
-    UNUSED(buft);
-}
-
 GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;
@@ -3060,7 +3054,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
         /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
         /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
         /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
-        /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
         /* .is_host = */ ggml_backend_metal_buffer_type_is_host,
     },
     /* .context = */ NULL,
@@ -3175,6 +3168,12 @@ GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, con
     return ggml_metal_supports_op(metal_ctx, op);
 }

+GGML_CALL static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name;
+
+    UNUSED(backend);
+}
+
 static struct ggml_backend_i ggml_backend_metal_i = {
     /* .get_name = */ ggml_backend_metal_name,
     /* .free = */ ggml_backend_metal_free,
@@ -3185,9 +3184,11 @@ static struct ggml_backend_i ggml_backend_metal_i = {
     /* .synchronize = */ NULL,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_metal_graph_compute,
     /* .supports_op = */ ggml_backend_metal_supports_op,
+    /* .supports_buft = */ ggml_backend_metal_supports_buft,
     /* .offload_op = */ NULL,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
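Every backend touched by this commit stubs the new graph_plan_update slot with NULL. The intended contract (an assumption here; the diff itself only shows the slot being added) is that a holder of a compute plan can rebind it to a structurally identical graph when the callback exists, and must otherwise fall back to recreating the plan:

#include "ggml-backend-impl.h"

// Hedged sketch of how a scheduler might use the optional callback; `plan`
// and `graph` stand for a previously created plan and its updated cgraph.
if (backend->iface.graph_plan_update != NULL) {
    backend->iface.graph_plan_update(backend, plan, graph); // reuse the plan
} else {
    backend->iface.graph_plan_free(backend, plan);          // recreate it
    plan = backend->iface.graph_plan_create(backend, graph);
}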


@@ -540,22 +540,12 @@ GGML_CALL static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend
     return ggml_nbytes(tensor);
 }

-GGML_CALL static bool ggml_backend_rpc_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_rpc(backend)) {
-        return false;
-    }
-    ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
-    ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
-    return buft_ctx->endpoint == rpc_ctx->endpoint;
-}
-
 static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
     /* .get_name = */ ggml_backend_rpc_buffer_type_name,
     /* .alloc_buffer = */ ggml_backend_rpc_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_rpc_buffer_type_get_alignment,
     /* .get_max_size = */ ggml_backend_rpc_get_max_size,
     /* .get_alloc_size = */ ggml_backend_rpc_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_rpc_buffer_type_supports_backend,
     /* .is_host = */ NULL,
 };
@@ -638,6 +628,15 @@ GGML_CALL static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const
     return false;
 }

+GGML_CALL static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    if (buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
+        return false;
+    }
+    ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
+    ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
+    return buft_ctx->endpoint == rpc_ctx->endpoint;
+}
+
 static ggml_backend_i ggml_backend_rpc_interface = {
     /* .get_name = */ ggml_backend_rpc_name,
     /* .free = */ ggml_backend_rpc_free,
@@ -648,9 +647,11 @@ static ggml_backend_i ggml_backend_rpc_interface = {
     /* .synchronize = */ ggml_backend_rpc_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_rpc_graph_compute,
     /* .supports_op = */ ggml_backend_rpc_supports_op,
+    /* .supports_buft = */ ggml_backend_rpc_supports_buft,
     /* .offload_op = */ NULL,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
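For RPC the buft/backend match is per endpoint: tensors allocated through one server's buffer type must not be scheduled onto a backend that talks to a different server. A hedged usage sketch (the endpoint strings are placeholders):

#include "ggml-rpc.h"

// Two RPC backends pointed at different servers: a buffer allocated via the
// buffer type of rpc_a passes supports_buft on rpc_a but fails it on rpc_b,
// because the endpoint stored in the buffer type context differs.
ggml_backend_t rpc_a = ggml_backend_rpc_init("192.168.1.10:50052");
ggml_backend_t rpc_b = ggml_backend_rpc_init("192.168.1.11:50052");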


@@ -16631,22 +16631,12 @@ GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backen
     UNUSED(buft);
 }

-GGML_CALL static bool ggml_backend_sycl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_sycl(backend)) {
-        return false;
-    }
-    ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
-    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
-    return buft_ctx->device == sycl_ctx->device;
-}
-
 static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
     /* .get_name = */ ggml_backend_sycl_buffer_type_name,
     /* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment,
     /* .get_max_size = */ ggml_backend_sycl_buffer_type_get_max_size,
     /* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_sycl_buffer_type_supports_backend,
     /* .is_host = */ nullptr,
 };
@@ -16998,12 +16988,6 @@ GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_
     return total_size;
 }

-GGML_CALL static bool ggml_backend_sycl_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_sycl(backend);
-
-    UNUSED(buft);
-}
-
 GGML_CALL static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return false;
@@ -17016,7 +17000,6 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
     /* .get_alignment = */ ggml_backend_sycl_split_buffer_type_get_alignment,
     /* .get_max_size = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size = */ ggml_backend_sycl_split_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_sycl_split_buffer_type_supports_backend,
     /* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host,
 };
@@ -17102,7 +17085,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
         /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
         /* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
         /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-        /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
         /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
     },
     /* .context = */ nullptr,
@@ -17367,6 +17349,14 @@ GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const
     GGML_UNUSED(backend);
 }

+GGML_CALL static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    if (buft->iface.get_name != ggml_backend_sycl_buffer_type_name) {
+        return false;
+    }
+    ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
+    return buft_ctx->device == sycl_ctx->device;
+}
+
 static ggml_backend_i ggml_backend_sycl_interface = {
     /* .get_name = */ ggml_backend_sycl_name,
@@ -17378,9 +17368,11 @@ static ggml_backend_i ggml_backend_sycl_interface = {
     /* .synchronize = */ ggml_backend_sycl_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_sycl_graph_compute,
     /* .supports_op = */ ggml_backend_sycl_supports_op,
+    /* .supports_buft = */ ggml_backend_sycl_supports_buft,
     /* .offload_op = */ ggml_backend_sycl_offload_op,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,


@@ -6110,24 +6110,12 @@ GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_
     UNUSED(buft);
 }

-GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_vk(backend)) {
-        return false;
-    }
-    ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
-    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-    return buft_ctx->ctx->idx == ctx->idx;
-}
-
 static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
     /* .get_name = */ ggml_backend_vk_buffer_type_name,
     /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment,
     /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
     /* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_vk_buffer_type_supports_backend,
     /* .is_host = */ NULL,
 };
@@ -6203,7 +6191,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
         /* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment,
         /* .get_max_size = */ NULL, // defaults to SIZE_MAX
         /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-        /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
         /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
     },
     /* .context = */ nullptr,
@@ -6524,6 +6511,17 @@ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const g
     UNUSED(backend);
 }

+GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
+        return false;
+    }
+
+    ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
+    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
+
+    return buft_ctx->ctx->idx == ctx->idx;
+}
+
 // TODO: enable async and synchronize
 static ggml_backend_i ggml_backend_vk_interface = {
     /* .get_name = */ ggml_backend_vk_name,
@@ -6535,9 +6533,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
     /* .graph_plan_create = */ NULL,
     /* .graph_plan_free = */ NULL,
+    /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_vk_graph_compute,
     /* .supports_op = */ ggml_backend_vk_supports_op,
+    /* .supports_buft = */ ggml_backend_vk_supports_buft,
     /* .offload_op = */ ggml_backend_vk_offload_op,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,

ggml.c

@@ -297,12 +297,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {

 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
-#elif defined(GGML_USE_OPENBLAS)
-#if defined(GGML_BLAS_USE_MKL)
-#include <mkl.h>
-#else
-#include <cblas.h>
-#endif
 #endif

 // floating point type used to accumulate sums
@@ -12216,39 +12210,6 @@ static void ggml_compute_forward_group_norm(

 // ggml_compute_forward_mul_mat

-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-// helper function to determine if it is better to use BLAS or not
-// for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    //const int64_t ne00 = src0->ne[0];
-    //const int64_t ne01 = src0->ne[1];
-
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
-    //       all the experts for each batch element and the processing would become incredibly slow
-    // TODO: find the optimal values for these
-    if (dst->op != GGML_OP_MUL_MAT_ID &&
-        ggml_is_contiguous(src0) &&
-        ggml_is_contiguous(src1) &&
-        //src0->type == GGML_TYPE_F32 &&
-        src1->type == GGML_TYPE_F32 &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
-
-        /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
-        return true;
-    }
-
-    return false;
-}
-#endif
-
 static void ggml_compute_forward_mul_mat_one_chunk(
     const struct ggml_compute_params * params,
     struct ggml_tensor * dst,
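The deleted helper's logic is worth keeping in mind when reading the new backend: BLAS only pays off when every GEMM dimension is reasonably large, the operands are contiguous, and src1 is already F32 (src0 may be dequantized into a work buffer). A condensed restatement of the removed check (a sketch for reference, not the new backend's code; the thresholds of 32 come straight from the deleted lines and the TODO about tuning them still applies):

#include <stdbool.h>
#include <stdint.h>

// ne0/ne1 are the dst dimensions, ne10 is the shared inner (K) dimension.
// Prefer BLAS only when all three reach the (untuned) threshold of 32 and
// the inputs qualify; MUL_MAT_ID is excluded for the reason noted above.
static bool prefer_blas(int64_t ne0, int64_t ne1, int64_t ne10,
                        bool contiguous, bool src1_is_f32) {
    return contiguous && src1_is_f32 && ne0 >= 32 && ne1 >= 32 && ne10 >= 32;
}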
@@ -12386,73 +12347,6 @@ static void ggml_compute_forward_mul_mat(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows

-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
-        const int64_t ne_plane = ne01*ne00;
-        const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
-        UNUSED(desired_wsize);
-
-        if (params->type == GGML_TASK_TYPE_INIT) {
-            if (type != GGML_TYPE_F32) {
-                assert(params->wsize >= desired_wsize);
-                // parallelize by src0 rows
-                for (int64_t i13 = 0; i13 < ne13; i13++) {
-                    for (int64_t i12 = 0; i12 < ne12; i12++) {
-                        // broadcast src0 into src1 across 2nd,3rd dimension
-                        const int64_t i03 = i13/r3;
-                        const int64_t i02 = i12/r2;
-
-                        const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
-                        float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
-                        ggml_to_float_t const to_float = type_traits[type].to_float;
-
-                        for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
-                            to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
-                        }
-                    }
-                }
-            }
-            return;
-        }
-
-        if (params->type == GGML_TASK_TYPE_FINALIZE) {
-            return;
-        }
-
-        // perform sgemm, parallelization controlled by blas lib
-        if (ith != 0) {
-            return;
-        }
-
-        //const int64_t tgemm0 = ggml_perf_time_us();
-        for (int64_t i13 = 0; i13 < ne13; i13++) {
-            for (int64_t i12 = 0; i12 < ne12; i12++) {
-                const int64_t i03 = i13/r3;
-                const int64_t i02 = i12/r2;
-
-                const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
-                const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
-                float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-
-                if (type != GGML_TYPE_F32) {
-                    x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
-                }
-
-                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                            ne1, ne01, ne10,
-                            1.0f, y, ne10,
-                                  x, ne00,
-                            0.0f, d, ne01);
-            }
-        }
-        //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
-
-        //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
-
-        return;
-    }
-#endif

 #if GGML_USE_LLAMAFILE
     const bool src1_cont = ggml_is_contiguous(src1);
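The sgemm call that just left ggml.c computes dst = src1 · src0ᵀ in row-major terms: y (src1) is passed untransposed, x (src0, or its dequantized copy in wdata) is passed with CblasTrans, and the leading dimensions are the row strides ne10, ne00, ne01. A self-contained toy program with concrete sizes reproduces the argument shape (an illustration of the call's semantics, not code from the new backend):

#include <cblas.h>
#include <stdio.h>

int main(void) {
    // d(ne1 x ne01) = y(ne1 x ne10) * x(ne01 x ne10)^T, all row-major --
    // the same shape as the cblas_sgemm call removed above (here ne00 == ne10).
    enum { ne1 = 2, ne01 = 2, ne10 = 3 };
    const float y[ne1*ne10]  = {1, 2, 3,   4, 5, 6}; // rows of src1
    const float x[ne01*ne10] = {1, 0, 1,   0, 1, 0}; // rows of src0
    float d[ne1*ne01];

    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                ne1, ne01, ne10,
                1.0f, y, ne10,
                      x, ne10,
                0.0f, d, ne01);

    // each d[i][j] is the dot product of row i of y with row j of x:
    // expected output: 4 2 10 5
    for (int i = 0; i < ne1*ne01; i++) {
        printf("%g ", d[i]);
    }
    printf("\n");
    return 0;
}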
@@ -12833,19 +12727,7 @@ static void ggml_compute_forward_out_prod_f32(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows

-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    bool use_blas = ggml_is_matrix(src0) &&
-        ggml_is_matrix(src1) &&
-        ggml_is_contiguous(src0) &&
-        (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
-#endif
-
     if (params->type == GGML_TASK_TYPE_INIT) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
-        if (use_blas) {
-            return;
-        }
-#endif
         if (ith != 0) {
             return;
         }
@@ -12857,50 +12739,6 @@ static void ggml_compute_forward_out_prod_f32(
         return;
     }

-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (use_blas) {
-        if (params->ith != 0) { // All threads other than the first do no work.
-            return;
-        }
-        // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
-        // src0: (k,n)
-        // src1: (k,m)
-        // dst:  (m,n)
-        //
-        // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
-        // Also expressed as (major,minor)
-        // a: (m,k): so src1 transposed
-        // b: (k,n): so src0
-        // c: (m,n)
-        //
-        // However, if ggml_is_transposed(src1) is true, then
-        // src1->data already contains a transposed version, so sgemm mustn't
-        // transpose it further.
-
-        int n = src0->ne[0];
-        int k = src0->ne[1];
-        int m = src1->ne[0];
-
-        int transposeA, lda;
-
-        if (!ggml_is_transposed(src1)) {
-            transposeA = CblasTrans;
-            lda = m;
-        } else {
-            transposeA = CblasNoTrans;
-            lda = k;
-        }
-
-        float * a = (float *) ((char *) src1->data);
-        float * b = (float *) ((char *) src0->data);
-        float * c = (float *) ((char *) dst->data);
-
-        cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
-
-        return;
-    }
-#endif
-
     // dst[:,:,:,:] = 0
     // for i2,i3:
     //   for i1:
@@ -13030,8 +12868,6 @@ static void ggml_compute_forward_out_prod_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows

-    // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-
     if (params->type == GGML_TASK_TYPE_INIT) {
         if (ith != 0) {
             return;
@@ -13428,6 +13264,8 @@ static void ggml_compute_forward_get_rows_q(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);

+        assert(i01 >= 0 && i01 < ne01);
+
         dequantize_row_q(
                 (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                      (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
@@ -13471,6 +13309,8 @@ static void ggml_compute_forward_get_rows_f16(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);

+        assert(i01 >= 0 && i01 < ne01);
+
         ggml_fp16_to_fp32_row(
                 (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                      (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
@@ -13514,7 +13354,9 @@ static void ggml_compute_forward_get_rows_bf16(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);

-        ggml_bf16_to_fp32_row(
+        assert(i01 >= 0 && i01 < ne01);
+
+        ggml_bf16_to_fp32_row(
                 (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                      (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
     }
@@ -13557,6 +13399,8 @@ static void ggml_compute_forward_get_rows_f32(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);

+        assert(i01 >= 0 && i01 < ne01);
+
         ggml_vec_cpy_f32(nc,
                 (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
                 (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
@@ -19504,17 +19348,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
             {
                 const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;

-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                if (ggml_compute_forward_mul_mat_use_blas(node)) {
-                    if (node->src[0]->type != GGML_TYPE_F32) {
-                        // here we need memory for fully dequantized matrix from src0
-                        // take into account that src0 can be broadcasted into src1[2,3]
-                        cur = ggml_type_size(GGML_TYPE_F32)
-                            * node->src[0]->ne[0]*node->src[0]->ne[1]
-                            * node->src[1]->ne[2]*node->src[1]->ne[3];
-                    }
-                } else
-#endif
                 if (node->src[1]->type != vec_dot_type) {
                     cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
                 }
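The sizing that disappears here reserved room for a fully dequantized copy of src0, broadcast across src1's batch dimensions: sizeof(float) * ne00*ne01 * ne12*ne13 bytes. As a worked example (illustrative numbers, not from the diff), a 4096x4096 quantized weight broadcast over ne12*ne13 = 8 batch planes would need 4*4096*4096*8 bytes = 512 MiB of scratch. Moving the dequantize-then-sgemm path into a dedicated backend lets that buffer be owned and sized by the backend itself instead of being threaded through every graph plan.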


@@ -21,6 +21,10 @@
 #  include "ggml-kompute.h"
 #endif

+#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE)
+#  include "ggml-blas.h"
+#endif
+
 #ifdef GGML_USE_METAL
 #  include "ggml-metal.h"
 #endif
@@ -2298,9 +2302,13 @@ struct llama_context {
     std::vector<ggml_backend_t> backends;
 #ifdef GGML_USE_METAL
     ggml_backend_t backend_metal = nullptr;
+#endif
+#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE)
+    ggml_backend_t backend_blas = nullptr;
 #endif
     ggml_backend_t backend_cpu = nullptr;

     const llama_model & model;

     // key + value cache for the self attention
@@ -11516,17 +11524,17 @@ static struct ggml_cgraph * llama_build_graph(
         // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
         // FIXME: fix in ggml_backend_sched
-        const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
-        if (batch.n_tokens < 32 || full_offload) {
-            if (il != -1 && strcmp(name, "norm") == 0) {
-                for (auto * backend : lctx.backends) {
-                    if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
-                        ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
-                        break;
-                    }
-                }
-            }
-        }
+        //const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
+        //if (batch.n_tokens < 32 || full_offload) {
+        //    if (il != -1 && strcmp(name, "norm") == 0) {
+        //        for (auto * backend : lctx.backends) {
+        //            if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
+        //                ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
+        //                break;
+        //            }
+        //        }
+        //    }
+        //}
     };

     struct ggml_cgraph * result = NULL;
@@ -12017,6 +12025,11 @@ static void llama_graph_compute(
         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
         ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
     }
+#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE)
+    if (lctx.backend_blas != nullptr) {
+        ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads);
+    }
+#endif

     ggml_backend_sched_graph_compute_async(lctx.sched, gf);
@@ -12246,9 +12259,9 @@ static int llama_decode_internal(
     // with the BLAS calls. need a better solution
     // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
     // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
-    if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
-        n_threads = std::min(4, n_threads);
-    }
+    //if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+    //    n_threads = std::min(4, n_threads);
+    //}

     ggml_backend_sched_alloc_graph(lctx.sched, gf);
@@ -16226,6 +16239,16 @@ struct llama_context * llama_new_context_with_model(
             ctx->backends.push_back(backend);
         }
 #endif
+
+#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE)
+        ctx->backend_blas = ggml_backend_blas_init();
+        if (ctx->backend_blas == nullptr) {
+            LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
+        } else {
+            ctx->backends.push_back(ctx->backend_blas);
+        }
+#endif
+
 #if defined(GGML_USE_RPC)
         if (model->n_gpu_layers > 0) {
             for (const auto & endpoint : model->rpc_servers) {
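With initialization, thread control, and the build rules in place, the new backend's public surface used by this diff is just two entry points: ggml_backend_blas_init() and ggml_backend_blas_set_n_threads(). A plausible outline of ggml-blas.h consistent with those call sites (the ggml_backend_is_blas predicate and the exact GGML_API/GGML_CALL decorations are assumptions modeled on the other backend headers, not shown in this diff):

#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void);

GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend);

// number of threads used for conversion to float
// for OpenBLAS and BLIS this also sets the number of threads used by the BLAS kernels
GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);

#ifdef __cplusplus
}
#endif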