diff --git a/ggml.c b/ggml.c
index 6055da867..c0c847074 100644
--- a/ggml.c
+++ b/ggml.c
@@ -3807,9 +3807,10 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
 
     "CROSS_ENTROPY_LOSS",
     "CROSS_ENTROPY_LOSS_BACK",
+    "BARRIER",
 };
 
-static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
+static_assert(GGML_OP_COUNT == 69, "GGML_OP_COUNT != 69");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3887,9 +3888,10 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
 
     "cross_entropy_loss(x,y)",
     "cross_entropy_loss_back(x,y)",
+    "memory barrier",
 };
 
-static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
+static_assert(GGML_OP_COUNT == 69, "GGML_OP_COUNT != 69");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -15164,6 +15166,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 // nop
             } break;
+        case GGML_OP_BARRIER:
+            {
+                // nop
+            } break;
         case GGML_OP_COUNT:
             {
                 GGML_ASSERT(false);
@@ -15999,6 +16005,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 // nop
             } break;
+        case GGML_OP_BARRIER:
+            {
+                // nop
+            } break;
         case GGML_OP_COUNT:
             {
                 GGML_ASSERT(false);
@@ -16077,6 +16087,66 @@ static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_ten
     }
 }
 
+void ggml_graph_find_concurrency(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
+    int search_depth = 40; //we only find concurrency in this range to avoiding waste to much time
+    struct ggml_tensor * nodes_bak[GGML_MAX_NODES]={NULL};
+    struct ggml_tensor * barrier_node;
+    barrier_node = ggml_new_tensor_1d(ctx,GGML_TYPE_F32,0);
+    barrier_node->op=GGML_OP_BARRIER;
+
+    for (int i=0; i < cgraph->n_nodes; i++) {
+        nodes_bak[i] = cgraph->nodes[i];
+        cgraph->nodes[i] = NULL;
+    }
+
+    int n_left = cgraph->n_nodes;
+    int n_start = 0; // all nodes before n_start at nodes_bak array have been sorted and store back to cgraph->nodes
+    int level_pos = 0;  // at cgraph->nodes, the last layer (level) ends at level_pos
+    while (n_left > 0) {
+        // number of nodes at a layer (that can be issued concurrently)
+        int concurrency = 0;
+        for (int i = n_start; i < n_start + search_depth; i++) {
+            if (nodes_bak[i]) {
+
+                // if the requirements for nodes_bak[i] are satisfied
+                int exe_flag=1;
+                // scan all srcs
+                for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) {
+                    struct ggml_tensor * src_cur = nodes_bak[i]->src[src_ind];
+                    if (src_cur) {
+                        // if is leaf nodes it's satisfied.
+                        if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {continue;}
+                        // otherwise if this src is the output from previous nodes.
+
+                        int is_found = 0;
+                        // scan 2*search_depth back because we insert barrier nodes.
+                        for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
+                            if (cgraph->nodes[j] == src_cur) {is_found = 1; break;}
+                        }
+                        if (is_found == 0) {exe_flag = 0; break;}
+                    }
+                }
+                if (exe_flag) {
+                    cgraph->nodes[level_pos + concurrency] = nodes_bak[i];
+                    nodes_bak[i] = NULL;
+                    concurrency++;
+                }
+            }
+        }
+        n_left -= concurrency;
+        // adding a barrier between different layer
+        cgraph->nodes[level_pos + concurrency] = barrier_node;
+        cgraph->n_nodes++;
+        // jump all sorted nodes at nodes_bak
+        while (!nodes_bak[n_start]) {n_start++;}
+        level_pos += concurrency + 1;
+    }
+    //remove the last barrier after result_output
+    cgraph->nodes[cgraph->n_nodes-1] = NULL;
+    cgraph->n_nodes--;
+
+}
+
 void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
     ggml_build_forward_impl(cgraph, tensor, true);
 }
@@ -16721,6 +16791,10 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 {
                     n_tasks = 1;
                 } break;
+            case GGML_OP_BARRIER:
+                {
+                    // nop
+                } break;
             case GGML_OP_COUNT:
                 {
                     GGML_ASSERT(false);
diff --git a/ggml.h b/ggml.h
index 5023b1652..b0f49face 100644
--- a/ggml.h
+++ b/ggml.h
@@ -194,7 +194,7 @@
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
 
 #define GGML_MAX_DIMS          4
-#define GGML_MAX_NODES         4096
+#define GGML_MAX_NODES         8192
 #define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_SRC           6
@@ -387,6 +387,8 @@ extern "C" {
         GGML_OP_CROSS_ENTROPY_LOSS,
         GGML_OP_CROSS_ENTROPY_LOSS_BACK,
 
+        GGML_OP_BARRIER, // Any operation between two barriers can be issued concurrently.
+        
         GGML_OP_COUNT,
     };
 
@@ -1363,6 +1365,9 @@ extern "C" {
     GGML_API void               ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
     GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
 
+    //sort all nodes in a graph to find operations that can be issued concurrently, insert memory barrier if necessary
+    GGML_API void ggml_graph_find_concurrency(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
+    
     // print info and performance information for the graph
     GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);