ggml: try to issue operations concurrently on GPU

This commit adds a ggml_graph_find_concurrency function that finds
operations which can be issued simultaneously by the GPU.

Before sending a graph to the GPU backend we can call the new function
to find concurrency in the graph. It sorts all the nodes and inserts
memory barrier nodes where necessary. A backend can simply ignore the
barrier nodes and issue the operations sequentially, or it can
concurrently issue all the operations between two barriers.
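
For illustration, a backend-side dispatch loop could look roughly like the
sketch below. This is only a sketch: encode_op() and wait_for_gpu() are
hypothetical backend helpers, not part of ggml.

#include "ggml.h"

// hypothetical backend helpers (not part of ggml)
void encode_op(struct ggml_tensor * node);  // enqueue one operation on the GPU
void wait_for_gpu(void);                    // wait for all enqueued operations

// walk a graph that has been reordered by ggml_graph_find_concurrency:
// every run of nodes between two GGML_OP_BARRIER nodes may be issued together
static void gpu_dispatch_graph(struct ggml_cgraph * gf) {
    for (int i = 0; i < gf->n_nodes; i++) {
        struct ggml_tensor * node = gf->nodes[i];
        if (node->op == GGML_OP_BARRIER) {
            wait_for_gpu(); // finish the current group before starting the next one
            continue;
        }
        encode_op(node); // enqueue without blocking
    }
    wait_for_gpu(); // drain the last group
}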
lshzh-ww 2023-07-21 11:23:18 -04:00
parent c8e6ef1846
commit 1c3030ee41
2 changed files with 82 additions and 3 deletions

ggml.c (78 changes)

@@ -3807,9 +3807,10 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"CROSS_ENTROPY_LOSS",
"CROSS_ENTROPY_LOSS_BACK",
"BARRIER",
};
static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
static_assert(GGML_OP_COUNT == 69, "GGML_OP_COUNT != 69");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -3887,9 +3888,10 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"cross_entropy_loss(x,y)",
"cross_entropy_loss_back(x,y)",
"memory barrier",
};
static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
static_assert(GGML_OP_COUNT == 69, "GGML_OP_COUNT != 69");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -15164,6 +15166,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
// nop
} break;
case GGML_OP_BARRIER:
{
// nop
} break;
case GGML_OP_COUNT:
{
GGML_ASSERT(false);
@@ -15999,6 +16005,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{
// nop
} break;
case GGML_OP_BARRIER:
{
// nop
} break;
case GGML_OP_COUNT:
{
GGML_ASSERT(false);
@@ -16077,6 +16087,66 @@ static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_ten
}
}
void ggml_graph_find_concurrency(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
int search_depth = 40; // we only look for concurrency within this range, to avoid spending too much time
struct ggml_tensor * nodes_bak[GGML_MAX_NODES] = {NULL};
struct ggml_tensor * barrier_node;
barrier_node = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 0);
barrier_node->op = GGML_OP_BARRIER;
for (int i=0; i < cgraph->n_nodes; i++) {
nodes_bak[i] = cgraph->nodes[i];
cgraph->nodes[i] = NULL;
}
int n_left = cgraph->n_nodes;
int n_start = 0; // all nodes before n_start in the nodes_bak array have been sorted and stored back into cgraph->nodes
int level_pos = 0; // in cgraph->nodes, the last layer (level) ends at level_pos
while (n_left > 0) {
// number of nodes at a layer (that can be issued concurrently)
int concurrency = 0;
for (int i = n_start; i < n_start + search_depth; i++) {
if (nodes_bak[i]) {
// check whether the requirements for nodes_bak[i] are satisfied
int exe_flag = 1;
// scan all srcs
for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) {
struct ggml_tensor * src_cur = nodes_bak[i]->src[src_ind];
if (src_cur) {
// leaf nodes are always satisfied
if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {continue;}
// otherwise, check whether this src is the output of a node that has already been sorted
int is_found = 0;
// scan 2*search_depth nodes back, because barrier nodes have been inserted
for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
if (cgraph->nodes[j] == src_cur) {is_found = 1; break;}
}
if (is_found == 0) {exe_flag = 0; break;}
}
}
if (exe_flag) {
cgraph->nodes[level_pos + concurrency] = nodes_bak[i];
nodes_bak[i] = NULL;
concurrency++;
}
}
}
n_left -= concurrency;
// add a barrier between layers
cgraph->nodes[level_pos + concurrency] = barrier_node;
cgraph->n_nodes++;
// skip the nodes in nodes_bak that have already been sorted
while (n_left > 0 && !nodes_bak[n_start]) {n_start++;}
level_pos += concurrency + 1;
}
// remove the trailing barrier that was inserted after the final output node
cgraph->nodes[cgraph->n_nodes-1] = NULL;
cgraph->n_nodes--;
}
void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
ggml_build_forward_impl(cgraph, tensor, true);
}
@@ -16721,6 +16791,10 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
{
n_tasks = 1;
} break;
case GGML_OP_BARRIER:
{
// nop
} break;
case GGML_OP_COUNT:
{
GGML_ASSERT(false);

ggml.h (7 changes)

@@ -194,7 +194,7 @@
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
#define GGML_MAX_DIMS 4
#define GGML_MAX_NODES 4096
#define GGML_MAX_NODES 8192
#define GGML_MAX_PARAMS 256
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_SRC 6
@@ -387,6 +387,8 @@ extern "C" {
GGML_OP_CROSS_ENTROPY_LOSS,
GGML_OP_CROSS_ENTROPY_LOSS_BACK,
GGML_OP_BARRIER, // Any operation between two barriers can be issued concurrently.
GGML_OP_COUNT,
};
@@ -1363,6 +1365,9 @@ extern "C" {
GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
// sort all nodes in a graph to find operations that can be issued concurrently; insert memory barriers if necessary
GGML_API void ggml_graph_find_concurrency(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
// print info and performance information for the graph
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
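
On the caller side, usage could look roughly like the following sketch. This
is an assumption-laden illustration: ctx and result are taken to come from the
usual ggml graph construction, and the handoff to the GPU backend is only
indicated by a comment.

// build the forward graph as usual
struct ggml_cgraph gf = ggml_build_forward(result);

// sort the nodes into concurrency levels and insert GGML_OP_BARRIER nodes
ggml_graph_find_concurrency(ctx, &gf);

// hand gf to the GPU backend: it may either ignore the barrier nodes and
// run the nodes sequentially, or issue every node between two barriers
// at the same time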