diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index c2c893780..449b4e9ec 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -1342,10 +1342,17 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
 // expand the graph nodes without creating leafs.
 struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) {
     // check if already visited
-    if (t->visited) {
-        return t;
+    for (int i = 0; i < g->n_nodes; i++) {
+        if (g->nodes[i] == t) {
+            return t;
+        }
+    }
+
+    for (int i = 0; i < g->n_leafs; i++) {
+        if (g->leafs[i] == t) {
+            return t;
+        }
     }
-    t->visited = true;
 
     for (int i = 0; i < GGML_MAX_SRC; ++i) {
         if (t->src[i]) {
diff --git a/ggml.c b/ggml.c
index 964a6b9c3..9af00ee6e 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4592,7 +4592,6 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.op           =*/ GGML_OP_NONE,
         /*.op_params    =*/ {0},
         /*.is_param     =*/ false,
-        /*.visited      =*/ false,
         /*.grad         =*/ NULL,
         /*.src          =*/ { NULL },
         /*.perf_runs    =*/ 0,
@@ -15743,6 +15742,34 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
     }
 }
 
+static_assert(GGML_GRAPH_HASHTABLE_SIZE > GGML_MAX_NODES * 2, "GGML_GRAPH_HT_SIZE is too small");
+
+static size_t hash(void * p) {
+    return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
+}
+
+static bool hash_insert(void * hash_table[], void * p) {
+    size_t h = hash(p);
+
+    // linear probing
+    size_t i = h;
+    while (hash_table[i] != NULL && hash_table[i] != p) {
+        i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
+        if (i == h) {
+            // hash table is full
+            GGML_ASSERT(false);
+        }
+    }
+
+    if (hash_table[i] == p) {
+        return true;
+    }
+
+    // insert
+    hash_table[i] = p;
+    return false;
+}
+
 static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
     if (node->grad == NULL) {
         // this usually happens when we generate intermediate nodes from constants in the backward pass
@@ -15753,11 +15780,9 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
     }
 
     // check if already visited
-    if (node->visited) {
-        GGML_ASSERT(cgraph->n_nodes > 0 || cgraph->n_leafs > 0); // to fix this, call ggml_graph_close() after building the graph
+    if (hash_insert(cgraph->visited_hash_table, node)) {
         return;
     }
-    node->visited = true;
 
     for (int i = 0; i < GGML_MAX_SRC; ++i) {
         if (node->src[i]) {
@@ -15809,31 +15834,17 @@ static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_ten
 }
 
 void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
-    GGML_ASSERT(!cgraph->closed && "graph is closed");
     ggml_build_forward_impl(cgraph, tensor, true);
 }
 
-void ggml_graph_close(struct ggml_cgraph * cgraph) {
-    if (cgraph->closed) {
-        return;
-    }
-    for (int i = 0; i < cgraph->n_nodes; ++i) {
-        cgraph->nodes[i]->visited = false;
-    }
-    for (int i = 0; i < cgraph->n_leafs; ++i) {
-        cgraph->leafs[i]->visited = false;
-    }
-    cgraph->closed = true;
-}
-
 struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
     struct ggml_cgraph result = {
         /*.n_nodes      =*/ 0,
         /*.n_leafs      =*/ 0,
-        /*.closed       =*/ false,
         /*.nodes        =*/ { NULL },
         /*.grads        =*/ { NULL },
         /*.leafs        =*/ { NULL },
+        /*.hash_table   =*/ { NULL },
         /*.perf_runs    =*/ 0,
         /*.perf_cycles  =*/ 0,
         /*.perf_time_us =*/ 0,
@@ -16145,8 +16156,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 }
 
 struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
-    ggml_graph_close(cgraph);
-
     if (n_threads <= 0) {
         n_threads = GGML_DEFAULT_N_THREADS;
     }
diff --git a/ggml.h b/ggml.h
index 3003901ac..b59a847ef 100644
--- a/ggml.h
+++ b/ggml.h
@@ -422,8 +422,7 @@ extern "C" {
         // op params - allocated as int32_t for alignment
         int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(uint32_t)];
 
-        uint32_t is_param:1;
-        uint32_t visited:1; // used to build graphs
+        bool is_param;
 
         struct ggml_tensor * grad;
         struct ggml_tensor * src[GGML_MAX_SRC];
@@ -460,16 +459,22 @@ extern "C" {
         void * abort_callback_data;
     };
 
+    // next prime after GGML_MAX_NODES
+    // #define GGML_GRAPH_HASHTABLE_SIZE 4099
+    // next prime after GGML_MAX_NODES * 2 (nodes + leafs)
+    #define GGML_GRAPH_HASHTABLE_SIZE 8273
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
         int n_leafs;
-        bool closed;
 
         struct ggml_tensor * nodes[GGML_MAX_NODES];
         struct ggml_tensor * grads[GGML_MAX_NODES];
         struct ggml_tensor * leafs[GGML_MAX_NODES];
 
+        void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
+
         // performance
         int perf_runs;
         int64_t perf_cycles;
@@ -1351,11 +1356,6 @@ extern "C" {
 
     GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
 
-    // resets the visited flag for all the tensors in the graph
-    // called by ggml_graph_plan()
-    // shouldn't be necessary to call manually except building when building multiple graphs without computing them
-    GGML_API void ggml_graph_close(struct ggml_cgraph * cgraph);
-
     GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
     GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
 
diff --git a/llama.cpp b/llama.cpp
index cc866295f..70a3ac9c1 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1701,18 +1701,17 @@ static bool llama_eval_internal(
     // logits -> probs
     //cur = ggml_soft_max_inplace(ctx0, cur);
 
-    //fprintf(stderr, "graph build time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
-
     // run the computation
     ggml_build_forward_expand(&gf, cur);
 
+    // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
+
#if GGML_USE_MPI
     ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
#endif
 
#ifdef GGML_USE_METAL
     if (lctx.ctx_metal && N == 1) {
-        ggml_graph_close(&gf); // should only be required for the Metal backend, as ggml_graph_plan() does this automatically
         ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, &gf);
         ggml_metal_get_tensor (lctx.ctx_metal, cur);
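
The core of the change is easiest to see in isolation: instead of a per-tensor `visited` bit that has to be cleared by `ggml_graph_close()`, the graph keeps a fixed-size open-addressing table of tensor pointers, and `hash_insert()` answers "have I already seen this pointer?" while recording it on the first visit. The standalone sketch below is not part of the patch; the names `hash_ptr`, `HASHTABLE_SIZE`, and the `main` driver are illustrative only, but the hashing and linear-probing logic mirrors the `hash_insert()` added to ggml.c above.

// Standalone sketch (illustrative, not part of the patch) of the visited-set
// mechanism: pointers are hashed by address into a fixed-size table and
// collisions are resolved by linear probing.
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define HASHTABLE_SIZE 8273  // same prime as GGML_GRAPH_HASHTABLE_SIZE above

static size_t hash_ptr(void * p) {
    return (size_t)p % HASHTABLE_SIZE;
}

// returns true if p was already present, false if it has just been inserted
static bool hash_insert(void * table[], void * p) {
    size_t h = hash_ptr(p);
    size_t i = h;
    while (table[i] != NULL && table[i] != p) {
        i = (i + 1) % HASHTABLE_SIZE;          // linear probing
        assert(i != h && "hash table is full");
    }
    if (table[i] == p) {
        return true;                           // already visited
    }
    table[i] = p;                              // first visit: record the pointer
    return false;
}

int main(void) {
    static void * visited[HASHTABLE_SIZE] = { NULL };
    int a, b;
    printf("%d\n", hash_insert(visited, &a));  // 0: first time &a is seen
    printf("%d\n", hash_insert(visited, &b));  // 0: first time &b is seen
    printf("%d\n", hash_insert(visited, &a));  // 1: &a was already recorded
    return 0;
}

Because the table lives inside `struct ggml_cgraph` and is zero-initialized with the rest of the graph, each freshly built graph starts with an empty visited set, which is why `ggml_graph_close()` and the reset pass over all nodes and leafs can be removed.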