diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index c2c893780..449b4e9ec 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -1342,10 +1342,17 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
 // expand the graph nodes without creating leafs.
 struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) {
     // check if already visited
-    if (t->visited) {
-        return t;
+    for (int i = 0; i < g->n_nodes; i++) {
+        if (g->nodes[i] == t) {
+            return t;
+        }
+    }
+
+    for (int i = 0; i < g->n_leafs; i++) {
+        if (g->leafs[i] == t) {
+            return t;
+        }
     }
-    t->visited = true;
 
     for (int i = 0; i < GGML_MAX_SRC; ++i) {
         if (t->src[i]) {
diff --git a/ggml.c b/ggml.c
index 964a6b9c3..9af00ee6e 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4592,7 +4592,6 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.op           =*/ GGML_OP_NONE,
         /*.op_params    =*/ {0},
         /*.is_param     =*/ false,
-        /*.visited      =*/ false,
         /*.grad         =*/ NULL,
         /*.src          =*/ { NULL },
         /*.perf_runs    =*/ 0,
@@ -15743,6 +15742,34 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
     }
 }
 
+static_assert(GGML_GRAPH_HASHTABLE_SIZE > GGML_MAX_NODES * 2, "GGML_GRAPH_HT_SIZE is too small");
+
+static size_t hash(void * p) {
+    return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
+}
+
+static bool hash_insert(void * hash_table[], void * p) {
+    size_t h = hash(p);
+
+    // linear probing
+    size_t i = h;
+    while (hash_table[i] != NULL && hash_table[i] != p) {
+        i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
+        if (i == h) {
+            // hash table is full
+            GGML_ASSERT(false);
+        }
+    }
+
+    if (hash_table[i] == p) {
+        return true;
+    }
+
+    // insert
+    hash_table[i] = p;
+    return false;
+}
+
 static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
     if (node->grad == NULL) {
         // this usually happens when we generate intermediate nodes from constants in the backward pass
@@ -15753,11 +15780,9 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
     }
 
     // check if already visited
-    if (node->visited) {
-        GGML_ASSERT(cgraph->n_nodes > 0 || cgraph->n_leafs > 0); // to fix this, call ggml_graph_close() after building the graph
+    if (hash_insert(cgraph->visited_hash_table, node)) {
         return;
     }
-    node->visited = true;
 
     for (int i = 0; i < GGML_MAX_SRC; ++i) {
         if (node->src[i]) {
@@ -15809,31 +15834,17 @@ static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_ten
 }
 
 void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
-    GGML_ASSERT(!cgraph->closed && "graph is closed");
     ggml_build_forward_impl(cgraph, tensor, true);
 }
 
-void ggml_graph_close(struct ggml_cgraph * cgraph) {
-    if (cgraph->closed) {
-        return;
-    }
-    for (int i = 0; i < cgraph->n_nodes; ++i) {
-        cgraph->nodes[i]->visited = false;
-    }
-    for (int i = 0; i < cgraph->n_leafs; ++i) {
-        cgraph->leafs[i]->visited = false;
-    }
-    cgraph->closed = true;
-}
-
 struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
     struct ggml_cgraph result = {
         /*.n_nodes      =*/ 0,
         /*.n_leafs      =*/ 0,
-        /*.closed       =*/ false,
         /*.nodes        =*/ { NULL },
         /*.grads        =*/ { NULL },
         /*.leafs        =*/ { NULL },
+        /*.hash_table   =*/ { NULL },
         /*.perf_runs    =*/ 0,
         /*.perf_cycles  =*/ 0,
         /*.perf_time_us =*/ 0,
@@ -16145,8 +16156,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 }
 
 struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
-    ggml_graph_close(cgraph);
-
     if (n_threads <= 0) {
         n_threads = GGML_DEFAULT_N_THREADS;
     }
diff --git a/ggml.h b/ggml.h
index 3003901ac..b59a847ef 100644
--- a/ggml.h
+++ b/ggml.h
@@ -422,8 +422,7 @@ extern "C" {
         // op params - allocated as int32_t for alignment
         int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(uint32_t)];
 
-        uint32_t is_param:1;
-        uint32_t visited:1; // used to build graphs
+        bool is_param;
 
         struct ggml_tensor * grad;
         struct ggml_tensor * src[GGML_MAX_SRC];
@@ -460,16 +459,22 @@ extern "C" {
         void * abort_callback_data;
     };
 
+    // next prime after GGML_MAX_NODES
+    // #define GGML_GRAPH_HASHTABLE_SIZE 4099
+    // next prime after GGML_MAX_NODES * 2 (nodes + leafs)
+    #define GGML_GRAPH_HASHTABLE_SIZE 8273
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
         int n_leafs;
-        bool closed;
 
         struct ggml_tensor * nodes[GGML_MAX_NODES];
         struct ggml_tensor * grads[GGML_MAX_NODES];
         struct ggml_tensor * leafs[GGML_MAX_NODES];
 
+        void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
+
         // performance
         int perf_runs;
         int64_t perf_cycles;
@@ -1351,11 +1356,6 @@ extern "C" {
 
     GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
 
-    // resets the visited flag for all the tensors in the graph
-    // called by ggml_graph_plan()
-    // shouldn't be necessary to call manually except building when building multiple graphs without computing them
-    GGML_API void ggml_graph_close(struct ggml_cgraph * cgraph);
-
     GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
     GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
 
diff --git a/llama.cpp b/llama.cpp
index cc866295f..70a3ac9c1 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1701,18 +1701,17 @@ static bool llama_eval_internal(
     // logits -> probs
     //cur = ggml_soft_max_inplace(ctx0, cur);
 
-    //fprintf(stderr, "graph build time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
-
     // run the computation
     ggml_build_forward_expand(&gf, cur);
 
+    // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
+
#if GGML_USE_MPI
     ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
#endif
 
#ifdef GGML_USE_METAL
     if (lctx.ctx_metal && N == 1) {
-        ggml_graph_close(&gf); // should only be required for the Metal backend, as ggml_graph_plan() does this automatically
         ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, &gf);
         ggml_metal_get_tensor (lctx.ctx_metal, cur);
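
The core of the change is easiest to see in isolation: instead of a per-tensor `visited` bit that has to be cleared by `ggml_graph_close()`, the graph keeps a fixed-size open-addressing table of tensor pointers, and `hash_insert()` answers "have I already seen this pointer?" while recording it on the first visit. The standalone sketch below is not part of the patch; the names `hash_ptr`, `HASHTABLE_SIZE`, and the `main` driver are illustrative only, but the hashing and linear-probing logic mirrors the `hash_insert()` added to ggml.c above.

// Standalone sketch (illustrative, not part of the patch) of the visited-set
// mechanism: pointers are hashed by address into a fixed-size table and
// collisions are resolved by linear probing.
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define HASHTABLE_SIZE 8273  // same prime as GGML_GRAPH_HASHTABLE_SIZE above

static size_t hash_ptr(void * p) {
    return (size_t)p % HASHTABLE_SIZE;
}

// returns true if p was already present, false if it has just been inserted
static bool hash_insert(void * table[], void * p) {
    size_t h = hash_ptr(p);
    size_t i = h;
    while (table[i] != NULL && table[i] != p) {
        i = (i + 1) % HASHTABLE_SIZE;          // linear probing
        assert(i != h && "hash table is full");
    }
    if (table[i] == p) {
        return true;                           // already visited
    }
    table[i] = p;                              // first visit: record the pointer
    return false;
}

int main(void) {
    static void * visited[HASHTABLE_SIZE] = { NULL };
    int a, b;
    printf("%d\n", hash_insert(visited, &a));  // 0: first time &a is seen
    printf("%d\n", hash_insert(visited, &b));  // 0: first time &b is seen
    printf("%d\n", hash_insert(visited, &a));  // 1: &a was already recorded
    return 0;
}

Because the table lives inside `struct ggml_cgraph` and is zero-initialized with the rest of the graph, each freshly built graph starts with an empty visited set, which is why `ggml_graph_close()` and the reset pass over all nodes and leafs can be removed.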