diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 449b4e9ec..c2c893780 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1342,17 +1342,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn( // expand the graph nodes without creating leafs. struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) { // check if already visited - for (int i = 0; i < g->n_nodes; i++) { - if (g->nodes[i] == t) { - return t; - } - } - - for (int i = 0; i < g->n_leafs; i++) { - if (g->leafs[i] == t) { - return t; - } + if (t->visited) { + return t; } + t->visited = true; for (int i = 0; i < GGML_MAX_SRC; ++i) { if (t->src[i]) { diff --git a/ggml.c b/ggml.c index 9ee4a8d7f..18091e814 100644 --- a/ggml.c +++ b/ggml.c @@ -4594,6 +4594,7 @@ struct ggml_tensor * ggml_new_tensor_impl( /*.is_param =*/ false, /*.grad =*/ NULL, /*.src =*/ { NULL }, + /*.visited =*/ false, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, @@ -15752,17 +15753,11 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * } // check if already visited - for (int i = 0; i < cgraph->n_nodes; i++) { - if (cgraph->nodes[i] == node) { - return; - } - } - - for (int i = 0; i < cgraph->n_leafs; i++) { - if (cgraph->leafs[i] == node) { - return; - } + if (node->visited) { + GGML_ASSERT(cgraph->n_nodes > 0 || cgraph->n_leafs > 0); // to fix this, call ggml_graph_close() after building the graph + return; } + node->visited = true; for (int i = 0; i < GGML_MAX_SRC; ++i) { if (node->src[i]) { @@ -15814,13 +15809,28 @@ static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_ten } void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) { + GGML_ASSERT(!cgraph->closed && "graph is closed"); ggml_build_forward_impl(cgraph, tensor, 
true); } +void ggml_graph_close(struct ggml_cgraph * cgraph) { + if (cgraph->closed) { + return; + } + for (int i = 0; i < cgraph->n_nodes; ++i) { + cgraph->nodes[i]->visited = false; + } + for (int i = 0; i < cgraph->n_leafs; ++i) { + cgraph->leafs[i]->visited = false; + } + cgraph->closed = true; +} + struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { struct ggml_cgraph result = { /*.n_nodes =*/ 0, /*.n_leafs =*/ 0, + /*.closed =*/ false, /*.nodes =*/ { NULL }, /*.grads =*/ { NULL }, /*.leafs =*/ { NULL }, @@ -15865,7 +15875,7 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg if (node->is_param) { GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node); - ggml_build_forward_impl(&result, node->grad, true); + ggml_build_forward_expand(&result, node->grad); } } @@ -16135,6 +16145,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { + ggml_graph_close(cgraph); + if (n_threads <= 0) { n_threads = GGML_DEFAULT_N_THREADS; } diff --git a/ggml.h b/ggml.h index 871c85a89..ed6b36e23 100644 --- a/ggml.h +++ b/ggml.h @@ -427,6 +427,8 @@ extern "C" { struct ggml_tensor * grad; struct ggml_tensor * src[GGML_MAX_SRC]; + bool visited; // used to build graphs + // performance int perf_runs; int64_t perf_cycles; @@ -438,7 +440,7 @@ extern "C" { void * extra; // extra things e.g. 
for ggml-cuda.cu - char padding[8]; + char padding[4]; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); @@ -463,6 +465,7 @@ extern "C" { struct ggml_cgraph { int n_nodes; int n_leafs; + bool closed; struct ggml_tensor * nodes[GGML_MAX_NODES]; struct ggml_tensor * grads[GGML_MAX_NODES]; @@ -1349,6 +1352,11 @@ extern "C" { GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); + // resets the visited flag for all the tensors in the graph + // called by ggml_graph_plan() + // shouldn't be necessary to call manually except when building multiple graphs without computing them + GGML_API void ggml_graph_close(struct ggml_cgraph * cgraph); + GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); diff --git a/llama.cpp b/llama.cpp index 5a8453bec..cc866295f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1701,6 +1701,8 @@ static bool llama_eval_internal( // logits -> probs //cur = ggml_soft_max_inplace(ctx0, cur); + //fprintf(stderr, "graph build time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0); + // run the computation ggml_build_forward_expand(&gf, cur); @@ -1710,6 +1712,7 @@ static bool llama_eval_internal( #ifdef GGML_USE_METAL if (lctx.ctx_metal && N == 1) { + ggml_graph_close(&gf); // should only be required for the Metal backend, as ggml_graph_plan() does this automatically ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); ggml_metal_graph_compute(lctx.ctx_metal, &gf); ggml_metal_get_tensor (lctx.ctx_metal, cur);