diff --git a/ggml-backend.c b/ggml-backend.c
index 2cc9e09a8..80e129cf8 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -1078,6 +1078,8 @@ struct ggml_backend_sched {
     ggml_backend_sched_eval_callback callback_eval;
     void * callback_eval_user_data;
 
+    bool debug;
+
     // align context_buffer to GGML_MEM_ALIGN
 #ifdef _MSC_VER
     __declspec(align(GGML_MEM_ALIGN))
@@ -1130,11 +1132,6 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED
 #define GET_CAUSE(node) ""
 #endif
 
-//#define DEBUG_PASS1
-//#define DEBUG_PASS2
-//#define DEBUG_PASS3
-//#define DEBUG_PASS4
-
 // returns the backend that should be used for the node based on the current locations
 static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
     // TODO: use supports_op to check if the backend supports the op
@@ -1232,7 +1229,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
     }
 }
 
-static int set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
+static int ggml_backend_sched_set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
     if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
         *node_backend_id = cur_backend_id;
         SET_CAUSE(node, "2.2");
@@ -1252,7 +1249,7 @@ static int set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node
     return cur_backend_id;
 }
 
-static bool buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int cur_backend_id) {
+static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int cur_backend_id) {
     ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
     ggml_backend_buffer_type_t buft = NULL;
 
@@ -1322,9 +1319,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             }
         }
     }
-#ifdef DEBUG_PASS1
-    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
 
     // pass 2: expand current backend assignments
     // assign the same backend to adjacent nodes
@@ -1350,7 +1344,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
             } else if (cur_backend_id != -1) {
                 // FIXME: clean this
-                cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
+                cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id);
                 if (cur_backend_id == sched->n_backends - 1) {
                     // skip cpu (lowest prio backend)
                     cur_backend_id = -1;
@@ -1375,7 +1369,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     cur_backend_id = *node_backend_id;
                 }
             } else if (cur_backend_id != -1) {
-                cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
+                cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id);
                 if (cur_backend_id == sched->n_backends - 1) {
                     // skip cpu (lowest prio backend)
                     cur_backend_id = -1;
@@ -1395,7 +1389,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (*node_backend_id != -1) {
                 cur_backend_id = *node_backend_id;
             } else if (cur_backend_id != -1) {
-                cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
+                cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
@@ -1411,15 +1405,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (*node_backend_id != -1) {
                 cur_backend_id = *node_backend_id;
             } else if (cur_backend_id != -1) {
-                cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
+                cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
 
-#ifdef DEBUG_PASS2
-    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
-
     // pass 3: assign backends to remaining src from dst and view_src
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
@@ -1446,9 +1436,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             }
         }
     }
-#ifdef DEBUG_PASS3
-    fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
 
     // pass 4: split graph, find tensors that need to be copied
     {
@@ -1499,7 +1486,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
                     const size_t id = hash_id(src);
                     int src_backend_id = sched->tensor_backend_id[id];
-                    bool supported = buffer_supported(sched, src, cur_backend_id);
+                    bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
                     if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) {
                         //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
                         need_new_split = true;
@@ -1560,7 +1547,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     }
                 }
 
-                bool supported = buffer_supported(sched, src, cur_backend_id);
+                bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
                 if (src_backend_id != cur_backend_id && !supported) {
                     // create a copy of the input in the split's backend
                     const size_t id = hash_id(src);
@@ -1587,12 +1574,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         split->i_end = graph->n_nodes;
         sched->n_splits = i_split + 1;
     }
-#ifdef DEBUG_PASS4
-    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
 
-    // create copies of the graph for each split
-    // TODO: avoid this copy
+    if (sched->debug) {
+        ggml_backend_sched_print_assignments(sched, graph);
+    }
 
     // swap node_backend_ids and leaf_backend_ids and prevs
     {
@@ -1605,6 +1590,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         sched->prev_leaf_backend_ids = tmp;
     }
 
+    // create copies of the graph for each split
+    // TODO: avoid this copy
     struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &sched->splits[i];
@@ -1805,6 +1792,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
 
     struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
 
+    sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
+
     // initialize hash table
     sched->hash_set = ggml_hash_set_new(graph_size);
     sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
diff --git a/ggml-blas.h b/ggml-blas.h
index 646ca84ef..f2e37de06 100644
--- a/ggml-blas.h
+++ b/ggml-blas.h
@@ -14,6 +14,7 @@ GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void);
 GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend);
 
 // number of threads used for conversion to float
+// for openblas and blis, this will also set the number of threads used for blas operations
 GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
 
diff --git a/llama.cpp b/llama.cpp
index 7e76c022b..225ea977f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -11533,17 +11533,18 @@ static struct ggml_cgraph * llama_build_graph(
 
         // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
         // FIXME: fix in ggml_backend_sched
-        //const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
-        //if (batch.n_tokens < 32 || full_offload) {
-        //    if (il != -1 && strcmp(name, "norm") == 0) {
-        //        for (auto * backend : lctx.backends) {
-        //            if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
-        //                ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
-        //                break;
-        //            }
-        //        }
-        //    }
-        //}
+        const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
+        if (batch.n_tokens < 32 || full_offload) {
+            if (il != -1 && strcmp(name, "norm") == 0) {
+                for (auto * backend : lctx.backends) {
+                    if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) &&
+                        (ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) {
+                        ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
+                        break;
+                    }
+                }
+            }
+        }
     };
 
     struct ggml_cgraph * result = NULL;
@@ -12261,17 +12262,6 @@ static int llama_decode_internal(
         }
 
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
-        // for big prompts, if BLAS is enabled, it is better to use only one thread
-        // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-        // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
-        //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
-        //       with the BLAS calls. need a better solution
-        // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
-        //                   being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
-        //if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
-        //    n_threads = std::min(4, n_threads);
-        //}
-
         ggml_backend_sched_alloc_graph(lctx.sched, gf);
 
         llama_set_inputs(lctx, u_batch);
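
For reference, a minimal usage sketch (not part of the patch) of the two knobs touched above: the GGML_SCHED_DEBUG environment variable, which makes the scheduler print its backend/split assignments, and ggml_backend_blas_set_n_threads() from ggml-blas.h. The thread count and the use of setenv() are illustrative assumptions; normally the variable would simply be exported in the shell before running.

/* usage sketch, assuming a POSIX environment; thread count is a placeholder */
#include <stdio.h>
#include <stdlib.h>

#include "ggml-backend.h"
#include "ggml-blas.h"

int main(void) {
    // ggml_backend_sched_new() reads GGML_SCHED_DEBUG once at creation time,
    // so the variable must be set before the scheduler is constructed
    setenv("GGML_SCHED_DEBUG", "1", 1);

    ggml_backend_t blas = ggml_backend_blas_init();
    if (blas == NULL) {
        fprintf(stderr, "BLAS backend not available\n");
        return 1;
    }

    if (ggml_backend_is_blas(blas)) {
        // with openblas/blis this also controls the threads used by the BLAS calls,
        // not only the conversion to float (see the ggml-blas.h comment above)
        ggml_backend_blas_set_n_threads(blas, 8);
    }

    // ... build a graph and a ggml_backend_sched over this backend here ...

    ggml_backend_free(blas);
    return 0;
}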