ggml: try to issue operations concurrently on GPU

This commit adds a ggml_graph_find_concurrency function that finds
operations which can be issued simultaneously by the GPU.

Before sending a graph to the GPU backend we can call the new function
to find concurrency in the graph. It sorts all the nodes and inserts
memory barrier nodes where necessary. A backend can simply ignore the
barrier nodes and issue the operations sequentially, or it can
concurrently issue all the operations between two barriers.
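
For illustration, a backend-side dispatch loop could look roughly like the
sketch below. This is only a sketch: encode_op() and wait_for_gpu() are
hypothetical backend helpers, not part of ggml.

#include "ggml.h"

// hypothetical backend helpers (not part of ggml)
void encode_op(struct ggml_tensor * node);  // enqueue one operation on the GPU
void wait_for_gpu(void);                    // wait for all enqueued operations

// walk a graph that has been reordered by ggml_graph_find_concurrency:
// every run of nodes between two GGML_OP_BARRIER nodes may be issued together
static void gpu_dispatch_graph(struct ggml_cgraph * gf) {
    for (int i = 0; i < gf->n_nodes; i++) {
        struct ggml_tensor * node = gf->nodes[i];
        if (node->op == GGML_OP_BARRIER) {
            wait_for_gpu(); // finish the current group before starting the next one
            continue;
        }
        encode_op(node); // enqueue without blocking
    }
    wait_for_gpu(); // drain the last group
}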
lshzh-ww 2023-07-21 11:23:18 -04:00
parent c8e6ef1846
commit 1c3030ee41
2 changed files with 82 additions and 3 deletions

ggml.c (78 changes)

@@ -3807,9 +3807,10 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"CROSS_ENTROPY_LOSS",
"CROSS_ENTROPY_LOSS_BACK",
"BARRIER",
};
static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
static_assert(GGML_OP_COUNT == 69, "GGML_OP_COUNT != 69");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -3887,9 +3888,10 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"cross_entropy_loss(x,y)",
"cross_entropy_loss_back(x,y)",
"memory barrier",
};
static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
static_assert(GGML_OP_COUNT == 69, "GGML_OP_COUNT != 69");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -15164,6 +15166,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
// nop
} break;
case GGML_OP_BARRIER:
{
// nop
} break;
case GGML_OP_COUNT:
{
GGML_ASSERT(false);
@@ -15999,6 +16005,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{
// nop
} break;
case GGML_OP_BARRIER:
{
// nop
} break;
case GGML_OP_COUNT:
{
GGML_ASSERT(false);
@@ -16077,6 +16087,66 @@ static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_ten
}
}
void ggml_graph_find_concurrency(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
int search_depth = 40; // we only look for concurrency within this range, to avoid spending too much time
struct ggml_tensor * nodes_bak[GGML_MAX_NODES] = {NULL};
struct ggml_tensor * barrier_node;
barrier_node = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 0);
barrier_node->op = GGML_OP_BARRIER;
for (int i=0; i < cgraph->n_nodes; i++) {
nodes_bak[i] = cgraph->nodes[i];
cgraph->nodes[i] = NULL;
}
int n_left = cgraph->n_nodes;
int n_start = 0; // all nodes before n_start in the nodes_bak array have been sorted and stored back into cgraph->nodes
int level_pos = 0; // in cgraph->nodes, the last layer (level) ends at level_pos
while (n_left > 0) {
// number of nodes at a layer (that can be issued concurrently)
int concurrency = 0;
for (int i = n_start; i < n_start + search_depth; i++) {
if (nodes_bak[i]) {
// check whether the requirements for nodes_bak[i] are satisfied
int exe_flag = 1;
// scan all srcs
for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) {
struct ggml_tensor * src_cur = nodes_bak[i]->src[src_ind];
if (src_cur) {
// leaf nodes are always satisfied
if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {continue;}
// otherwise, check whether this src is the output of a node that has already been sorted
int is_found = 0;
// scan 2*search_depth nodes back, because barrier nodes have been inserted
for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
if (cgraph->nodes[j] == src_cur) {is_found = 1; break;}
}
if (is_found == 0) {exe_flag = 0; break;}
}
}
if (exe_flag) {
cgraph->nodes[level_pos + concurrency] = nodes_bak[i];
nodes_bak[i] = NULL;
concurrency++;
}
}
}
n_left -= concurrency;
// add a barrier between layers
cgraph->nodes[level_pos + concurrency] = barrier_node;
cgraph->n_nodes++;
// skip the nodes in nodes_bak that have already been sorted
while (n_left > 0 && !nodes_bak[n_start]) {n_start++;}
level_pos += concurrency + 1;
}
// remove the trailing barrier that was inserted after the final output node
cgraph->nodes[cgraph->n_nodes-1] = NULL;
cgraph->n_nodes--;
}
void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
ggml_build_forward_impl(cgraph, tensor, true);
}
@@ -16721,6 +16791,10 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
{
n_tasks = 1;
} break;
case GGML_OP_BARRIER:
{
// nop
} break;
case GGML_OP_COUNT:
{
GGML_ASSERT(false);

ggml.h (7 changes)

@@ -194,7 +194,7 @@
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
#define GGML_MAX_DIMS 4
#define GGML_MAX_NODES 4096
#define GGML_MAX_NODES 8192
#define GGML_MAX_PARAMS 256
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_SRC 6
@@ -387,6 +387,8 @@ extern "C" {
GGML_OP_CROSS_ENTROPY_LOSS,
GGML_OP_CROSS_ENTROPY_LOSS_BACK,
GGML_OP_BARRIER, // Any operation between two barriers can be issued concurrently.
GGML_OP_COUNT,
};
@@ -1363,6 +1365,9 @@ extern "C" {
GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
// sort all nodes in a graph to find operations that can be issued concurrently; insert memory barriers if necessary
GGML_API void ggml_graph_find_concurrency(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
// print info and performance information for the graph
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
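
On the caller side, usage could look roughly like the following sketch. This
is an assumption-laden illustration: ctx and result are taken to come from the
usual ggml graph construction, and the handoff to the GPU backend is only
indicated by a comment.

// build the forward graph as usual
struct ggml_cgraph gf = ggml_build_forward(result);

// sort the nodes into concurrency levels and insert GGML_OP_BARRIER nodes
ggml_graph_find_concurrency(ctx, &gf);

// hand gf to the GPU backend: it may either ignore the barrier nodes and
// run the nodes sequentially, or issue every node between two barriers
// at the same time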