sched : print assignments when GGML_SCHED_DEBUG env variable is set

parent a8a1bf7981
commit ecb75b5f54

3 changed files with 30 additions and 50 deletions
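For context, here is a minimal standalone sketch of the behavior this commit adds. It is not part of the diff: `sched_like` and the printf body are illustrative stand-ins for the real scheduler state and for `ggml_backend_sched_print_assignments`. The scheduler records whether `GGML_SCHED_DEBUG` is present in the environment when it is created, and later prints the graph split/backend assignments only when that flag is set, replacing the old compile-time `DEBUG_PASS*` defines.

```c
// Illustrative stand-in only: shows the env-var gating pattern introduced by
// this commit, not the real ggml scheduler structures.
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct sched_like {
    bool debug; // mirrors the new `bool debug;` field in ggml_backend_sched
};

static void sched_like_new(struct sched_like * sched) {
    // mirrors: sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
    sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
}

static void sched_like_split_graph(const struct sched_like * sched) {
    if (sched->debug) {
        // the real code calls ggml_backend_sched_print_assignments(sched, graph) here
        printf("(per-node backend assignments would be printed here)\n");
    }
}

int main(void) {
    struct sched_like sched = { false };
    sched_like_new(&sched);
    sched_like_split_graph(&sched); // prints only if GGML_SCHED_DEBUG is set
    return 0;
}
```

In other words, the assignment dump can now be enabled at run time, for example by exporting `GGML_SCHED_DEBUG=1` before launching a binary built against this code, instead of recompiling with the `DEBUG_PASS*` macros.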
@@ -1078,6 +1078,8 @@ struct ggml_backend_sched {
     ggml_backend_sched_eval_callback callback_eval;
     void * callback_eval_user_data;
 
+    bool debug;
+
     // align context_buffer to GGML_MEM_ALIGN
 #ifdef _MSC_VER
     __declspec(align(GGML_MEM_ALIGN))
@@ -1130,11 +1132,6 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED
 #define GET_CAUSE(node) ""
 #endif
 
-//#define DEBUG_PASS1
-//#define DEBUG_PASS2
-//#define DEBUG_PASS3
-//#define DEBUG_PASS4
-
 // returns the backend that should be used for the node based on the current locations
 static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
     // TODO: use supports_op to check if the backend supports the op
@@ -1232,7 +1229,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
     }
 }
 
-static int set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
+static int ggml_backend_sched_set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
     if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
         *node_backend_id = cur_backend_id;
         SET_CAUSE(node, "2.2");
@@ -1252,7 +1249,7 @@ static int set_if_supports(ggml_backend_sched_t sched, struct ggml_tensor * node
     return cur_backend_id;
 }
 
-static bool buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int cur_backend_id) {
+static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int cur_backend_id) {
     ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
     ggml_backend_buffer_type_t buft = NULL;
 
@@ -1322,9 +1319,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             }
         }
     }
-#ifdef DEBUG_PASS1
-    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
 
     // pass 2: expand current backend assignments
     // assign the same backend to adjacent nodes
@@ -1350,7 +1344,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             }
         } else if (cur_backend_id != -1) {
             // FIXME: clean this
-            cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
+            cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id);
             if (cur_backend_id == sched->n_backends - 1) {
                 // skip cpu (lowest prio backend)
                 cur_backend_id = -1;
@@ -1375,7 +1369,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 cur_backend_id = *node_backend_id;
             }
         } else if (cur_backend_id != -1) {
-            cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
+            cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id);
             if (cur_backend_id == sched->n_backends - 1) {
                 // skip cpu (lowest prio backend)
                 cur_backend_id = -1;
@@ -1395,7 +1389,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (*node_backend_id != -1) {
                 cur_backend_id = *node_backend_id;
             } else if (cur_backend_id != -1) {
-                cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
+                cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
@@ -1411,15 +1405,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (*node_backend_id != -1) {
                 cur_backend_id = *node_backend_id;
             } else if (cur_backend_id != -1) {
-                cur_backend_id = set_if_supports(sched, node, cur_backend_id, node_backend_id);
+                cur_backend_id = ggml_backend_sched_set_if_supports(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
-
-#ifdef DEBUG_PASS2
-    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
 
     // pass 3: assign backends to remaining src from dst and view_src
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
@@ -1446,9 +1436,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             }
         }
     }
-#ifdef DEBUG_PASS3
-    fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
 
     // pass 4: split graph, find tensors that need to be copied
     {
@@ -1499,7 +1486,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
                     const size_t id = hash_id(src);
                     int src_backend_id = sched->tensor_backend_id[id];
-                    bool supported = buffer_supported(sched, src, cur_backend_id);
+                    bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
                     if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) {
                         //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
                         need_new_split = true;
@@ -1560,7 +1547,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     }
                 }
 
-                bool supported = buffer_supported(sched, src, cur_backend_id);
+                bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
                 if (src_backend_id != cur_backend_id && !supported) {
                     // create a copy of the input in the split's backend
                     const size_t id = hash_id(src);
@@ -1587,12 +1574,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         split->i_end = graph->n_nodes;
         sched->n_splits = i_split + 1;
     }
-#ifdef DEBUG_PASS4
-    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
 
-    // create copies of the graph for each split
-    // TODO: avoid this copy
+    if (sched->debug) {
+        ggml_backend_sched_print_assignments(sched, graph);
+    }
 
     // swap node_backend_ids and leaf_backend_ids and prevs
     {
@@ -1605,6 +1590,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         sched->prev_leaf_backend_ids = tmp;
     }
 
+    // create copies of the graph for each split
+    // TODO: avoid this copy
     struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &sched->splits[i];
@@ -1805,6 +1792,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
 
     struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
 
+    sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
+
     // initialize hash table
     sched->hash_set = ggml_hash_set_new(graph_size);
     sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
@@ -14,6 +14,7 @@ GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void);
 GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend);
 
 // number of threads used for conversion to float
 // for openblas and blis, this will also set the number of threads used for blas operations
 GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
 
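As a side note on the header hunk above, here is a hedged usage sketch of the declared API. The include paths are assumptions; only `ggml_backend_blas_init`, `ggml_backend_is_blas`, `ggml_backend_blas_set_n_threads` and `ggml_backend_free` from the general backend API are used.

```c
// Hedged usage sketch for the BLAS backend API declared above; include paths
// are assumed, and error handling is kept minimal.
#include "ggml-backend.h" // assumed: provides ggml_backend_t and ggml_backend_free
#include "ggml-blas.h"    // assumed: the header shown in this hunk
#include <stdio.h>

int main(void) {
    ggml_backend_t backend = ggml_backend_blas_init();
    if (backend == NULL || !ggml_backend_is_blas(backend)) {
        fprintf(stderr, "failed to initialize the BLAS backend\n");
        return 1;
    }
    // per the comment above: sets the threads used for conversion to float, and for
    // openblas/blis also the threads used by the BLAS calls themselves
    ggml_backend_blas_set_n_threads(backend, 4);
    ggml_backend_free(backend);
    return 0;
}
```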
llama.cpp (34 changed lines)
@@ -11533,17 +11533,18 @@ static struct ggml_cgraph * llama_build_graph(
 
         // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
         // FIXME: fix in ggml_backend_sched
-        //const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
-        //if (batch.n_tokens < 32 || full_offload) {
-        //    if (il != -1 && strcmp(name, "norm") == 0) {
-        //        for (auto * backend : lctx.backends) {
-        //            if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
-        //                ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
-        //                break;
-        //            }
-        //        }
-        //    }
-        //}
+        const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
+        if (batch.n_tokens < 32 || full_offload) {
+            if (il != -1 && strcmp(name, "norm") == 0) {
+                for (auto * backend : lctx.backends) {
+                    if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) &&
+                        (ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) {
+                        ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
+                        break;
+                    }
+                }
+            }
+        }
     };
 
     struct ggml_cgraph * result = NULL;
@@ -12261,17 +12262,6 @@ static int llama_decode_internal(
         }
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
-        // for big prompts, if BLAS is enabled, it is better to use only one thread
-        // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-        // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
-        //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
-        //       with the BLAS calls. need a better solution
-        // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
-        //                   being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
-        //if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
-        //    n_threads = std::min(4, n_threads);
-        //}
-
         ggml_backend_sched_alloc_graph(lctx.sched, gf);
 
         llama_set_inputs(lctx, u_batch);
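To make the restored condition in `llama_build_graph` easier to read, here is a compact sketch of the selection rule it applies. The predicate functions below are stubs standing in for `ggml_backend_supports_buft`, `ggml_backend_supports_op` and `ggml_backend_offload_op`, and the types are simplified; this is not the real ggml API.

```c
// Sketch of the backend-selection rule from the restored llama.cpp block:
// pick the first backend that supports the layer's buffer type AND can either
// run the op itself or offload it. Stubs only; not the real ggml API.
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

typedef struct { const char * name; } backend_stub;

static bool stub_supports_buft(const backend_stub * b) { (void)b; return true;  } // stand-in for ggml_backend_supports_buft
static bool stub_supports_op  (const backend_stub * b) { (void)b; return true;  } // stand-in for ggml_backend_supports_op
static bool stub_offload_op   (const backend_stub * b) { (void)b; return false; } // stand-in for ggml_backend_offload_op

static const backend_stub * pick_backend_for_norm(const backend_stub * backends, size_t n) {
    for (size_t i = 0; i < n; i++) {
        if (stub_supports_buft(&backends[i]) &&
            (stub_supports_op(&backends[i]) || stub_offload_op(&backends[i]))) {
            // the real code then calls ggml_backend_sched_set_tensor_backend and breaks
            return &backends[i];
        }
    }
    return NULL;
}

int main(void) {
    backend_stub backends[] = { { "gpu" }, { "cpu" } };
    const backend_stub * chosen = pick_backend_for_norm(backends, 2);
    printf("chosen backend: %s\n", chosen ? chosen->name : "(none)");
    return 0;
}
```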