diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 449b4e9ec..c2c893780 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1342,17 +1342,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( // expand the graph nodes without creating leafs. struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) { // check if already visited - for (int i = 0; i < g->n_nodes; i++) { - if (g->nodes[i] == t) { - return t; - } - } - - for (int i = 0; i < g->n_leafs; i++) { - if (g->leafs[i] == t) { - return t; - } + if (t->visited) { + return t; } + t->visited = true; for (int i = 0; i < GGML_MAX_SRC; ++i) { if (t->src[i]) { diff --git a/ggml.c b/ggml.c index 9ee4a8d7f..18091e814 100644 --- a/ggml.c +++ b/ggml.c @@ -4594,6 +4594,7 @@ struct ggml_tensor * ggml_new_tensor_impl( /*.is_param =*/ false, /*.grad =*/ NULL, /*.src =*/ { NULL }, + /*.visited =*/ false, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, @@ -15752,17 +15753,11 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * } // check if already visited - for (int i = 0; i < cgraph->n_nodes; i++) { - if (cgraph->nodes[i] == node) { - return; - } - } - - for (int i = 0; i < cgraph->n_leafs; i++) { - if (cgraph->leafs[i] == node) { - return; - } + if (node->visited) { + GGML_ASSERT(cgraph->n_nodes > 0 || cgraph->n_leafs > 0); // to fix this, call ggml_graph_close() after building the graph + return; } + node->visited = true; for (int i = 0; i < GGML_MAX_SRC; ++i) { if (node->src[i]) { @@ -15814,13 +15809,28 @@ static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_ten } void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) { + GGML_ASSERT(!cgraph->closed && "graph is closed"); ggml_build_forward_impl(cgraph, tensor, 
true); } +void ggml_graph_close(struct ggml_cgraph * cgraph) { + if (cgraph->closed) { + return; + } + for (int i = 0; i < cgraph->n_nodes; ++i) { + cgraph->nodes[i]->visited = false; + } + for (int i = 0; i < cgraph->n_leafs; ++i) { + cgraph->leafs[i]->visited = false; + } + cgraph->closed = true; +} + struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { struct ggml_cgraph result = { /*.n_nodes =*/ 0, /*.n_leafs =*/ 0, + /*.closed =*/ false, /*.nodes =*/ { NULL }, /*.grads =*/ { NULL }, /*.leafs =*/ { NULL }, @@ -15865,7 +15875,7 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg if (node->is_param) { GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node); - ggml_build_forward_impl(&result, node->grad, true); + ggml_build_forward_expand(&result, node->grad); } } @@ -16135,6 +16145,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { + ggml_graph_close(cgraph); + if (n_threads <= 0) { n_threads = GGML_DEFAULT_N_THREADS; } diff --git a/ggml.h b/ggml.h index 871c85a89..ed6b36e23 100644 --- a/ggml.h +++ b/ggml.h @@ -427,6 +427,8 @@ extern "C" { struct ggml_tensor * grad; struct ggml_tensor * src[GGML_MAX_SRC]; + bool visited; // used to build graphs + // performance int perf_runs; int64_t perf_cycles; @@ -438,7 +440,7 @@ extern "C" { void * extra; // extra things e.g. 
for ggml-cuda.cu - char padding[8]; + char padding[4]; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); @@ -463,6 +465,7 @@ extern "C" { struct ggml_cgraph { int n_nodes; int n_leafs; + bool closed; struct ggml_tensor * nodes[GGML_MAX_NODES]; struct ggml_tensor * grads[GGML_MAX_NODES]; @@ -1349,6 +1352,11 @@ extern "C" { GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); + // resets the visited flag for all the tensors in the graph + // called by ggml_graph_plan() + // shouldn't be necessary to call manually except when building multiple graphs without computing them + GGML_API void ggml_graph_close(struct ggml_cgraph * cgraph); + GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); diff --git a/llama.cpp b/llama.cpp index 5a8453bec..cc866295f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1701,6 +1701,8 @@ static bool llama_eval_internal( // logits -> probs //cur = ggml_soft_max_inplace(ctx0, cur); + //fprintf(stderr, "graph build time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0); + // run the computation ggml_build_forward_expand(&gf, cur); @@ -1710,6 +1712,7 @@ static bool llama_eval_internal( #ifdef GGML_USE_METAL if (lctx.ctx_metal && N == 1) { + ggml_graph_close(&gf); // should only be required for the Metal backend, as ggml_graph_plan() does this automatically ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); ggml_metal_graph_compute(lctx.ctx_metal, &gf); ggml_metal_get_tensor (lctx.ctx_metal, cur);