diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 3ce91070b..d786dc94d 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -17,17 +17,6 @@ constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; constexpr float rms_norm_eps = 5e-6f; #endif -static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr); - - if (plan.work_size > 0) { - buf.resize(plan.work_size); - plan.work_data = buf.data(); - } - - ggml_graph_compute(graph, &plan); -} - static struct ggml_tensor * randomize_tensor( struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax ) { @@ -1514,8 +1503,6 @@ int main(int argc, char ** argv) { int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; - std::vector work_buffer; - for (int ex=0; ex & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr); - - if (plan.work_size > 0) { - buf.resize(plan.work_size); - plan.work_data = buf.data(); - } - - ggml_graph_compute(graph, &plan); -} - static float tensor_sum_elements(const ggml_tensor * tensor) { double sum = 0; if (tensor->type == GGML_TYPE_F32) { @@ -179,11 +168,10 @@ int main(int argc, char ** argv) { TENSOR_DUMP(m11); TENSOR_DUMP(m2); - std::vector work_buffer; + ggml_graph_prepare(gf, benchmark_params.n_threads, nullptr); + ggml_graph_work_init(gf, nullptr); - ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads); - - TENSOR_DUMP(gf->nodes[0]); + TENSOR_DUMP(ggml_graph_node(gf, 0)); printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype)); @@ -224,7 +212,7 @@ int main(int argc, char ** argv) { // Let's use the F32 result from above as a reference for the quantized multiplication - float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]); + float sum_of_F32_reference = tensor_sum_elements(ggml_graph_node(gf, 0)); printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n"); printf("=====================================================================================\n"); @@ -234,7 +222,7 @@ int main(int argc, char ** argv) { long long int start = ggml_time_us(); //printf("Running ggml_graph_compute\n"); - ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads); + ggml_graph_compute(gf31); long long int stop = ggml_time_us(); long long int usec = stop-start; @@ -252,7 +240,7 @@ int main(int argc, char ** argv) { // Check that the matrix multiplication result is in the right ballpark // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different - float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]); + float sum_of_Q4_result = tensor_sum_elements(ggml_graph_node(gf31, 0)); float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference); float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6 @@ -267,8 +255,11 @@ int main(int argc, char ** argv) { } // Running a different graph computation to make sure we override the CPU cache lines - ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads); + ggml_graph_compute(gf32); } + + ggml_graph_work_free(gf); + printf("\n"); printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations)); printf("=====================================================================================\n"); diff 
--git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp index 05c66856c..a969c486d 100644 --- a/examples/cvector-generator/pca.hpp +++ b/examples/cvector-generator/pca.hpp @@ -226,8 +226,8 @@ static ggml_status compute_piter( result.eigenvectors.resize(params.n_batch); result.distances.resize(params.n_batch); // get output nodes - for (int i = 0; i < gf->n_nodes; ++i) { - auto node = gf->nodes[i]; + for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) { + auto node = ggml_graph_node(gf, i); int iter = -1; // find b_tensor (without copying data from device) if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) { diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index ff324926a..90126ad1e 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -370,7 +370,7 @@ struct lora_merge_ctx { // write data to output file { - auto result = gf->nodes[gf->n_nodes - 1]; + auto * result = ggml_graph_node(gf, -1); size_t len = ggml_nbytes(result); if (read_buf.size() < len) { read_buf.resize(len); diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 9b890571e..5dfb333d1 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -2449,7 +2449,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ggml_backend_graph_compute(ctx->backend, gf); // the last node is the embedding tensor - struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor * embeddings = ggml_graph_node(gf, -1); // copy the embeddings to the location passed by the user ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 851af0f00..46413c901 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -183,8 +183,10 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0); // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false); ggml_build_forward_expand(gf, flatten); - ggml_graph_compute_with_ctx(model.ctx, gf, 1); - struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1]; + ggml_graph_prepare(gf, 1, nullptr); + ggml_graph_work_init(gf, model.ctx); + ggml_graph_compute(gf); + struct ggml_tensor* result = ggml_graph_node(gf, -1); memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context // append without newline tokens (default behavior in llava_arch when not using unpad ): diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 536018b66..684578f98 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -358,6 +358,7 @@ extern "C" { struct ggml_object; struct ggml_context; + struct ggml_cgraph; // NOTE: always add types at the end of the enum to keep backward compatibility enum ggml_type { @@ -575,23 +576,9 @@ extern "C" { GGML_TENSOR_FLAG_PARAM = 4, }; - // ggml object - struct ggml_object { - size_t offs; - size_t size; - - struct ggml_object * next; - - enum ggml_object_type type; - - char padding[4]; - }; - - static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); - // n-dimensional tensor struct ggml_tensor { - enum ggml_type type; + enum ggml_type type; GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage 
location of the tensor"); @@ -655,50 +642,7 @@ extern "C" { struct ggml_threadpool; // forward declaration, see ggml.c - typedef struct ggml_threadpool * ggml_threadpool_t; - - // the compute plan that needs to be prepared for ggml_graph_compute() - // since https://github.com/ggerganov/ggml/issues/287 - struct ggml_cplan { - size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()` - uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` - - int n_threads; - struct ggml_threadpool * threadpool; - - // abort ggml_graph_compute when true - ggml_abort_callback abort_callback; - void * abort_callback_data; - }; - - enum ggml_cgraph_eval_order { - GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0, - GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT, - GGML_CGRAPH_EVAL_ORDER_COUNT - }; - - typedef uint32_t ggml_bitset_t; - - struct ggml_hash_set { - size_t size; - ggml_bitset_t * used; // whether or not the keys are in use i.e. set - struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i) - }; - - // computation graph - struct ggml_cgraph { - int size; - int n_nodes; - int n_leafs; - - struct ggml_tensor ** nodes; - struct ggml_tensor ** grads; - struct ggml_tensor ** leafs; - - struct ggml_hash_set visited_hash_set; - - enum ggml_cgraph_eval_order order; - }; + typedef struct ggml_threadpool * ggml_threadpool_t; // scratch buffer struct ggml_scratch { @@ -2017,8 +1961,6 @@ extern "C" { typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata); typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata); - #define GGML_N_TASKS_MAX -1 - GGML_API struct ggml_tensor * ggml_map_custom1( struct ggml_context * ctx, struct ggml_tensor * a, @@ -2088,42 +2030,92 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * tensor); - GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep); - // graph allocation in a context - GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false - GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads); - GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph); - GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1); - GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst); - GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads - GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph); + GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false + GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads); + GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph); + GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst); + GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads + GGML_API void ggml_graph_clear (struct ggml_cgraph * 
cgraph);
+
+    GGML_API int ggml_graph_size (struct ggml_cgraph * cgraph);
+    GGML_API struct ggml_tensor * ggml_graph_node (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
+    GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph);
+    GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
+
+    GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);

     GGML_API size_t ggml_graph_overhead(void);
     GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);

-    GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
-    GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params *p, int n_threads);
-    GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
-    GGML_API struct ggml_threadpool* ggml_threadpool_new (struct ggml_threadpool_params * params);
-    GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
-    GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
-    GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
-    GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
+    // TODO: move these declarations above before the ggml_graph API and reorder the implementation order in ggml.c
+    // (unless the code has been moved to a separate source file)
+    GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
+    GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
+    GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
+    GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
+    GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
+    GGML_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
+    GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
+    GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);

-    // ggml_graph_plan() has to be called before ggml_graph_compute()
-    // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan(
-            const struct ggml_cgraph * cgraph,
-            int n_threads, /* = GGML_DEFAULT_N_THREADS */
-            struct ggml_threadpool * threadpool /* = NULL */ );
-    GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    // =================================================================================================
+    // CPU-only API for ggml_cgraph
+    //
+    // TODO: move to the CPU backend
+    // NOTE: avoid using, will be removed
+    //

-    // same as ggml_graph_compute() but the work data is allocated as a part of the context
-    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
-    GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
+    // loops through the graph and determines:
+    //
+    // - work size needed for CPU computation
+    // - number of threads to start
+    //
+    GGML_API enum ggml_status ggml_graph_prepare(
+            struct ggml_cgraph * cgraph,
+            int n_threads, /* = GGML_DEFAULT_N_THREADS */
+            struct ggml_threadpool * threadpool /* = NULL */ );
+
+    // get the estimated work size for the graph from ggml_graph_prepare()
+    GGML_API size_t ggml_graph_work_size(const struct ggml_cgraph * cgraph);
+
+    // if ctx is NULL, the work buffer will be dynamically allocated. in this case, call ggml_graph_work_free() to free the buffer
+    // otherwise, the work buffer will be allocated in the context. no need to free it
+    GGML_API enum ggml_status ggml_graph_work_init(struct ggml_cgraph * cgraph, struct ggml_context * ctx);
+    GGML_API void ggml_graph_work_free(struct ggml_cgraph * cgraph);
+
+    // note: call ggml_graph_prepare() and ggml_graph_work_init() first
+    //
+    // sample usages:
+    //
+    //  - no dynamic allocations:
+    //
+    //      ... prepare ggml_context ctx ...
+    //
+    //      ggml_graph_prepare (cgraph, n_threads, threadpool);
+    //      ggml_graph_work_init(cgraph, ctx);
+    //
+    //      ggml_graph_compute (cgraph); // can call many times
+    //
+    //      // no need to call ggml_graph_work_free() because it is allocated in ctx
+    //
+    //  - dynamic allocations:
+    //
+    //      ggml_graph_prepare (cgraph, n_threads, threadpool);
+    //      ggml_graph_work_init(cgraph, NULL); // will allocate memory
+    //
+    //      ggml_graph_compute (cgraph); // can call many times
+    //
+    //      ggml_graph_work_free(cgraph);
+    //
+    GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph);
+
+    // end of CPU-only API
+    // =================================================================================================
+
+    GGML_API void ggml_graph_set_abort_callback(struct ggml_cgraph * cgraph, ggml_abort_callback abort_callback, void * abort_data);

     GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);

@@ -2146,6 +2138,7 @@ extern "C" {
             struct ggml_cgraph * gb_tmp,
             struct ggml_tensor * * checkpoints,
             int n_checkpoints);
+
     //
     // optimization
     //

diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c
index b5d9301a7..4787a0bd1 100644
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -752,7 +752,8 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_
 }

 struct ggml_backend_plan_cpu {
-    struct ggml_cplan cplan;
+    // TODO: move member from ggml_cgraph here when the public CPU-only API is removed
+    struct ggml_cgraph cgraph;
 };

@@ -761,19 +762,19 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
     struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));

-    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
     cpu_plan->cgraph = *cgraph; // FIXME: deep copy
+    ggml_graph_prepare(&cpu_plan->cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);

-    if (cpu_plan->cplan.work_size > 0) {
-        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
-        if (cpu_plan->cplan.work_data == NULL) {
+    if (cpu_plan->cgraph.work_size > 0) {
+        cpu_plan->cgraph.work_data = malloc(cpu_plan->cgraph.work_size);
+        if (cpu_plan->cgraph.work_data == NULL) {
             free(cpu_plan);
             return NULL;
         }
     }

-    cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
-    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+    cpu_plan->cgraph.abort_callback = cpu_ctx->abort_callback;
+    cpu_plan->cgraph.abort_callback_data = cpu_ctx->abort_callback_data;

     return cpu_plan;
 }

@@ -781,7 +782,7 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
 GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

-    free(cpu_plan->cplan.work_data);
+
free(cpu_plan->cgraph.work_data); free(cpu_plan); GGML_UNUSED(backend); @@ -790,7 +791,7 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; - return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); + return ggml_graph_compute(&cpu_plan->cgraph); GGML_UNUSED(backend); } @@ -798,23 +799,24 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; - struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); + ggml_graph_prepare(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); - if (cpu_ctx->work_size < cplan.work_size) { + if (cpu_ctx->work_size < cgraph->work_size) { free(cpu_ctx->work_data); - cpu_ctx->work_data = malloc(cplan.work_size); + cpu_ctx->work_data = malloc(cgraph->work_size); if (cpu_ctx->work_data == NULL) { cpu_ctx->work_size = 0; return GGML_STATUS_ALLOC_FAILED; } - cpu_ctx->work_size = cplan.work_size; + cpu_ctx->work_size = cgraph->work_size; } - cplan.work_data = cpu_ctx->work_data; + cgraph->work_data = cpu_ctx->work_data; + cgraph->work_own = false; // always freed by ggml_backend_cpu_graph_plan_free - cplan.abort_callback = cpu_ctx->abort_callback; - cplan.abort_callback_data = cpu_ctx->abort_callback_data; + cgraph->abort_callback = cpu_ctx->abort_callback; + cgraph->abort_callback_data = cpu_ctx->abort_callback_data; - return ggml_graph_compute(cgraph, &cplan); + return ggml_graph_compute(cgraph); } GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp index 713731735..6d99c6bea 100644 --- a/ggml/src/ggml-blas.cpp +++ b/ggml/src/ggml-blas.cpp @@ -1,3 +1,4 @@ +#include "ggml-impl.h" #include "ggml-blas.h" #include "ggml-backend-impl.h" diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp index 06930ba2e..530b700e3 100644 --- a/ggml/src/ggml-cann.cpp +++ b/ggml/src/ggml-cann.cpp @@ -30,6 +30,7 @@ #include #include +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-cann/aclnn_ops.h" #include "ggml-cann/common.h" diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index d53de4edd..54f1a7c2d 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -1,5 +1,5 @@ #include "ggml-cuda.h" -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-cuda/common.cuh" diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 961f3c67b..31a46ea3e 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -629,8 +629,16 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) #endif +enum ggml_cgraph_eval_order { + GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0, + GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT, + GGML_CGRAPH_EVAL_ORDER_COUNT +}; + // bitset +typedef uint32_t ggml_bitset_t; + static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated"); #define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8) #define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1) @@ -656,6 +664,12 @@ static inline void 
ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) { #define GGML_HASHSET_FULL ((size_t)-1) #define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2) +struct ggml_hash_set { + size_t size; + ggml_bitset_t * used; // whether or not the keys are in use i.e. set + struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i) +}; + struct ggml_hash_set ggml_hash_set_new(size_t size); void ggml_hash_set_free(struct ggml_hash_set * hash_set); @@ -745,6 +759,37 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g GGML_ABORT("fatal error"); } +// computation graph + +struct ggml_cgraph { + int size; + int n_nodes; + int n_leafs; + + struct ggml_tensor ** nodes; + struct ggml_tensor ** grads; + struct ggml_tensor ** leafs; + + struct ggml_hash_set visited_hash_set; + + enum ggml_cgraph_eval_order order; + + // TODO: after the CPU-only API is removed, we can move the members below to ggml_backend_plan_cpu + + bool work_own; + size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()` + uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` + + int n_threads; + struct ggml_threadpool * threadpool; + + // abort ggml_graph_compute when true + ggml_abort_callback abort_callback; + void * abort_callback_data; +}; + +struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp index 41ac63fa4..7f0bd82d5 100644 --- a/ggml/src/ggml-kompute.cpp +++ b/ggml/src/ggml-kompute.cpp @@ -1,4 +1,4 @@ -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-backend.h" #include "ggml-backend-impl.h" #include "ggml-kompute.h" diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index 6d8a7c898..6c85acfec 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -1,7 +1,7 @@ #import "ggml-metal.h" +#import "ggml-impl.h" #import "ggml-backend-impl.h" -#import "ggml.h" #import @@ -882,7 +882,7 @@ static enum ggml_status ggml_metal_graph_compute( // create multiple command buffers and enqueue them // then, we encode the graph into the command buffers in parallel - const int n_nodes = gf->n_nodes; + const int n_nodes = gf->n_nodes; const int n_cb = ctx->n_cb; const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb; diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp index 9c600c7ca..a8a2eb85a 100644 --- a/ggml/src/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc.cpp @@ -1,5 +1,5 @@ #include "ggml-rpc.h" -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index 4f03b01e7..be56f8180 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -33,7 +33,7 @@ #include #include "ggml-sycl.h" -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-sycl/backend.hpp" diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 83737c1d9..bad960510 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -21,7 +21,7 @@ #include #include -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-vulkan-shaders.hpp" diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index d7157ca6d..643821b5b 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -287,6 +287,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) 
{ #define GGML_DEBUG 0 #define GGML_GELU_FP16 #define GGML_GELU_QUICK_FP16 +#define GGML_N_TASKS_MAX (-1) #define GGML_SOFT_MAX_UNROLL 4 #define GGML_VEC_DOT_UNROLL 2 @@ -1120,21 +1121,21 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { #define GGML_F32x4_ADD vaddq_f32 #define GGML_F32x4_MUL vmulq_f32 #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) -#define GGML_F32x4_REDUCE(res, x) \ -{ \ - int offset = GGML_F32_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f32(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f32(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f32(x[i], x[offset+i]); \ - } \ - res = GGML_F32x4_REDUCE_ONE(x[0]); \ +#define GGML_F32x4_REDUCE(res, x) \ +{ \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ + } \ + (res) = GGML_F32x4_REDUCE_ONE((x)[0]); \ } #define GGML_F32_VEC GGML_F32x4 @@ -1161,30 +1162,30 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) #define GGML_F16x8_ADD vaddq_f16 #define GGML_F16x8_MUL vmulq_f16 - #define GGML_F16x8_REDUCE(res, x) \ - do { \ - int offset = GGML_F16_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f16(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f16(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f16(x[i], x[offset+i]); \ - } \ - const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \ - const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \ - res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ + #define GGML_F16x8_REDUCE(res, x) \ + do { \ + int offset = GGML_F16_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ + } \ + const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \ + const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \ + (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ } while (0) #define GGML_F16_VEC GGML_F16x8 #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO #define GGML_F16_VEC_SET1 GGML_F16x8_SET1 #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p) - #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), r[i]) + #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i]) #define GGML_F16_VEC_FMA GGML_F16x8_FMA #define GGML_F16_VEC_ADD GGML_F16x8_ADD #define GGML_F16_VEC_MUL GGML_F16x8_MUL @@ -1893,6 +1894,23 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR) #endif +// +// ggml object +// + +struct ggml_object { + size_t offs; + size_t size; + + struct ggml_object * next; + + enum ggml_object_type type; + + char padding[4]; +}; + +static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); + // // ggml context // @@ -1983,7 +2001,6 @@ struct ggml_threadpool { ggml_cond_t cond; 
// cond.var for waiting for new work struct ggml_cgraph * cgraph; - struct ggml_cplan * cplan; // synchronization primitives atomic_int n_graph; // incremented when there is work to be done (i.e each graph) @@ -19071,14 +19088,21 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz assert(obj_size == (size_t)((char *)p - (char *)cgraph)); *cgraph = (struct ggml_cgraph) { - /*.size =*/ size, - /*.n_nodes =*/ 0, - /*.n_leafs =*/ 0, - /*.nodes =*/ nodes_ptr, - /*.grads =*/ grads_ptr, - /*.leafs =*/ leafs_ptr, - /*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr }, - /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, + /*.size =*/ size, + /*.n_nodes =*/ 0, + /*.n_leafs =*/ 0, + /*.nodes =*/ nodes_ptr, + /*.grads =*/ grads_ptr, + /*.leafs =*/ leafs_ptr, + /*.visited_hash_set =*/ { hash_size, hash_used, hash_keys_ptr }, + /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, + /*.work_own =*/ false, + /*.work_size =*/ 0, + /*.work_data =*/ NULL, + /*.n_threads =*/ GGML_DEFAULT_N_THREADS, + /*.threadpool =*/ NULL, + /*.abort_callback =*/ NULL, + /*.abort_callback_data =*/ NULL, }; ggml_hash_set_reset(&cgraph->visited_hash_set); @@ -19092,14 +19116,21 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) { struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) { struct ggml_cgraph cgraph = { - /*.size =*/ 0, - /*.n_nodes =*/ i1 - i0, - /*.n_leafs =*/ 0, - /*.nodes =*/ cgraph0->nodes + i0, - /*.grads =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL, - /*.leafs =*/ NULL, - /*.hash_table =*/ { 0, NULL, NULL }, - /*.order =*/ cgraph0->order, + /*.size =*/ 0, + /*.n_nodes =*/ i1 - i0, + /*.n_leafs =*/ 0, + /*.nodes =*/ cgraph0->nodes + i0, + /*.grads =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL, + /*.leafs =*/ NULL, + /*.hash_table =*/ { 0, NULL, NULL }, + /*.order =*/ cgraph0->order, + /*.work_own =*/ false, + /*.work_size =*/ 0, + /*.work_data =*/ NULL, + /*.n_threads =*/ GGML_DEFAULT_N_THREADS, + /*.threadpool =*/ NULL, + /*.abort_callback =*/ NULL, + /*.abort_callback_data =*/ NULL, }; return cgraph; @@ -19161,6 +19192,33 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) { ggml_hash_set_reset(&cgraph->visited_hash_set); } +int ggml_graph_size(struct ggml_cgraph * cgraph) { + return cgraph->size; +} + +struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) { + if (i < 0) { + GGML_ASSERT(cgraph->n_nodes + i >= 0); + return cgraph->nodes[cgraph->n_nodes + i]; + } + + GGML_ASSERT(i < cgraph->n_nodes); + return cgraph->nodes[i]; +} + +struct ggml_tensor ** ggml_graph_nodes(struct ggml_cgraph * cgraph) { + return cgraph->nodes; +} + +int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) { + return cgraph->n_nodes; +} + +void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) { + cgraph->nodes[cgraph->n_nodes] = tensor; + cgraph->n_nodes++; +} + // Android's libc implementation "bionic" does not support setting affinity #if defined(__gnu_linux__) static void set_numa_thread_affinity(int thread_n) { @@ -19708,11 +19766,10 @@ void ggml_threadpool_resume(struct ggml_threadpool * threadpool) { #endif } -struct ggml_cplan ggml_graph_plan( - const struct ggml_cgraph * cgraph, - int n_threads, - struct ggml_threadpool * threadpool) { - +enum ggml_status ggml_graph_prepare( + struct ggml_cgraph * cgraph, + int n_threads, + struct ggml_threadpool * threadpool) { if (threadpool == NULL) { GGML_PRINT_DEBUG("Threadpool is not specified. 
Will create a disposable threadpool : n_threads %d\n", n_threads); } @@ -19722,9 +19779,6 @@ struct ggml_cplan ggml_graph_plan( size_t work_size = 0; - struct ggml_cplan cplan; - memset(&cplan, 0, sizeof(struct ggml_cplan)); - int max_tasks = 1; // thread scheduling for the different operations + work buffer size estimation @@ -19876,28 +19930,63 @@ struct ggml_cplan ggml_graph_plan( work_size += CACHE_LINE_SIZE*(n_threads); } - cplan.threadpool = threadpool; - cplan.n_threads = MIN(max_tasks, n_threads); - cplan.work_size = work_size; - cplan.work_data = NULL; + cgraph->threadpool = threadpool; + cgraph->n_threads = MIN(max_tasks, n_threads); + cgraph->work_size = work_size; - return cplan; + ggml_graph_work_free(cgraph); + + return GGML_STATUS_SUCCESS; +} + +size_t ggml_graph_work_size(const struct ggml_cgraph * cgraph) { + return cgraph->work_size; +} + +enum ggml_status ggml_graph_work_init(struct ggml_cgraph * cgraph, struct ggml_context * ctx) { + GGML_ASSERT(cgraph->n_threads > 0 && "call ggml_graph_prepare first"); + + ggml_graph_work_free(cgraph); + + if (cgraph->work_size > 0) { + if (ctx == NULL) { + cgraph->work_data = GGML_ALIGNED_MALLOC(cgraph->work_size); + if (cgraph->work_data == NULL) { + return GGML_STATUS_ALLOC_FAILED; + } + + cgraph->work_own = true; + } else { + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cgraph->work_size); + + cgraph->work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + cgraph->work_own = false; + } + } + + return GGML_STATUS_SUCCESS; +} + +void ggml_graph_work_free(struct ggml_cgraph * cgraph) { + if (cgraph->work_data && cgraph->work_own) { + GGML_ALIGNED_FREE(cgraph->work_data); + cgraph->work_data = NULL; + } } static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; const struct ggml_cgraph * cgraph = state->threadpool->cgraph; - const struct ggml_cplan * cplan = state->threadpool->cplan; set_numa_thread_affinity(state->ith); struct ggml_compute_params params = { - /*.ith =*/ state->ith, - /*.nth =*/ state->threadpool->n_threads_cur, - /*.wsize =*/ cplan->work_size, - /*.wdata =*/ cplan->work_data, - /*.threadpool=*/ state->threadpool, + /*.ith =*/ state->ith, + /*.nth =*/ state->threadpool->n_threads_cur, + /*.wsize =*/ cgraph->work_size, + /*.wdata =*/ cgraph->work_data, + /*.threadpool =*/ state->threadpool, }; for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { @@ -19905,7 +19994,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { ggml_compute_forward(¶ms, node); - if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + if (state->ith == 0 && cgraph->abort_callback && cgraph->abort_callback(cgraph->abort_callback_data)) { state->threadpool->ec = GGML_STATUS_ABORTED; } @@ -20059,14 +20148,12 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons static struct ggml_threadpool * ggml_threadpool_new_impl( struct ggml_threadpool_params * tpp, - struct ggml_cgraph * cgraph, - struct ggml_cplan * cplan) { + struct ggml_cgraph * cgraph) { struct ggml_threadpool * threadpool = GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool)); { threadpool->cgraph = cgraph; - threadpool->cplan = cplan; threadpool->n_graph = 0; threadpool->n_barrier = 0; threadpool->n_barrier_passed = 0; @@ -20124,16 +20211,15 @@ static struct ggml_threadpool * ggml_threadpool_new_impl( } struct ggml_threadpool * ggml_threadpool_new(struct ggml_threadpool_params * tpp) { - 
return ggml_threadpool_new_impl(tpp, NULL, NULL); + return ggml_threadpool_new_impl(tpp, NULL); } -enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { - GGML_ASSERT(cplan); - GGML_ASSERT(cplan->n_threads > 0); - GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL); +enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph) { + GGML_ASSERT((cgraph->n_threads > 0 ) && "call ggml_graph_prepare first"); + GGML_ASSERT((cgraph->work_size == 0 || cgraph->work_data != NULL) && "call ggml_graph_work_init first"); - int n_threads = cplan->n_threads; - struct ggml_threadpool * threadpool = cplan->threadpool; + int n_threads = cgraph->n_threads; + struct ggml_threadpool * threadpool = cgraph->threadpool; bool disposable_threadpool = false; @@ -20142,19 +20228,18 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl disposable_threadpool = true; struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads); - threadpool = ggml_threadpool_new_impl(&ttp, cgraph, cplan); + threadpool = ggml_threadpool_new_impl(&ttp, cgraph); } else { // Reset some of the parameters that need resetting // No worker threads should be accessing the parameters below at this stage - threadpool->cgraph = cgraph; - threadpool->cplan = cplan; - threadpool->n_threads_cur = n_threads; - threadpool->current_chunk = 0; - threadpool->ec = GGML_STATUS_SUCCESS; + threadpool->cgraph = cgraph; + threadpool->n_threads_cur = n_threads; + threadpool->current_chunk = 0; + threadpool->ec = GGML_STATUS_SUCCESS; } if (n_threads > threadpool->n_threads_max) { - GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. Expect a bad time!\n"); + GGML_PRINT("WARNING: cgraph is requesting more threads than the threadpool contains. Expect a bad time!\n"); } #ifdef GGML_USE_OPENMP @@ -20193,14 +20278,9 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl return ret; } -enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { - struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL); - - struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); - - cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; - - return ggml_graph_compute(cgraph, &cplan); +void ggml_graph_set_abort_callback(struct ggml_cgraph * cgraph, ggml_abort_callback abort_callback, void * abort_data) { + cgraph->abort_callback = abort_callback; + cgraph->abort_callback_data = abort_data; } struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { @@ -21010,9 +21090,8 @@ static enum ggml_opt_result ggml_opt_adam( float * pf = params.past > 0 ? 
opt->adam.pf->data : NULL; // past function values - struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL); - struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); - cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + ggml_graph_prepare (gb, params.n_threads, NULL); + ggml_graph_work_init(gb, ctx); bool cancel = false; @@ -21028,7 +21107,7 @@ static enum ggml_opt_result ggml_opt_adam( } // ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(gb, &cplan); + ggml_graph_compute(gb); ggml_opt_acc_grad(np, ps, g, accum_norm); fx += ggml_get_f32_1d(f, 0); } @@ -21119,7 +21198,7 @@ static enum ggml_opt_result ggml_opt_adam( } // ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(gb, &cplan); + ggml_graph_compute(gb); ggml_opt_acc_grad(np, ps, g, accum_norm); fx += ggml_get_f32_1d(f, 0); } @@ -21204,7 +21283,6 @@ static enum ggml_opt_result linesearch_backtracking( const float * xp, struct ggml_tensor * f, struct ggml_cgraph * gb, - struct ggml_cplan * cplan, const int np, struct ggml_tensor * ps[], bool * cancel, @@ -21261,7 +21339,7 @@ static enum ggml_opt_result linesearch_backtracking( } // ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(gb, cplan); + ggml_graph_compute(gb); ggml_opt_acc_grad(np, ps, g, accum_norm); *fx += ggml_get_f32_1d(f, 0); } @@ -21357,9 +21435,8 @@ static enum ggml_opt_result ggml_opt_lbfgs( opt->iter = iter; } - struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL); - struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); - cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + ggml_graph_prepare (gb, params.n_threads, NULL); + ggml_graph_work_init(gb, ctx); float * x = opt->lbfgs.x->data; // current parameters float * xp = opt->lbfgs.xp->data; // previous parameters @@ -21404,7 +21481,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( } // ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(gb, &cplan); + ggml_graph_compute(gb); ggml_opt_acc_grad(np, ps, g, accum_norm); fx += ggml_get_f32_1d(f, 0); } @@ -21470,7 +21547,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( // to determine if the optimization should be cancelled // this is a simple change, but not doing this atm, since I don't have a nice // way to test and don't want to break something with so many changes lined up - ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data); + ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gb, np, ps, &cancel, callback, callback_data); if (cancel) { return GGML_OPT_RESULT_CANCEL; } diff --git a/src/llama.cpp b/src/llama.cpp index 40db03517..a10341b73 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9877,8 +9877,8 @@ struct llm_build_context { struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) { // find result_norm tensor for input struct ggml_tensor * inp = nullptr; - for (int i = gf->n_nodes - 1; i >= 0; --i) { - inp = gf->nodes[i]; + for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { + inp = ggml_graph_node(gf, i); if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) { break; } else { @@ -16205,8 +16205,8 @@ static int llama_decode_internal( ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); // the output is always the last tensor in the graph - struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; - struct ggml_tensor * embd 
= gf->nodes[gf->n_nodes - 2]; + struct ggml_tensor * res = ggml_graph_node(gf, -1); + struct ggml_tensor * embd = ggml_graph_node(gf, -2); if (lctx.n_outputs == 0) { // no output @@ -16215,9 +16215,9 @@ static int llama_decode_internal( } else if (cparams.embeddings) { res = nullptr; // do not extract logits for embedding case embd = nullptr; - for (int i = gf->n_nodes - 1; i >= 0; --i) { - if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) { - embd = gf->nodes[i]; + for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { + if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) { + embd = ggml_graph_node(gf, i); break; } } @@ -16432,15 +16432,15 @@ static int llama_encode_internal( // there are two cases here if (llama_model_has_decoder(&lctx.model)) { // first case is an encoder-decoder T5 model where embeddings are passed to decoder - embd = gf->nodes[gf->n_nodes - 1]; + embd = ggml_graph_node(gf, -1); GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor"); } else { // second case is an encoder-only T5 model if (cparams.embeddings) { // only output embeddings if required - embd = gf->nodes[gf->n_nodes - 1]; + embd = ggml_graph_node(gf, -1); if (strcmp(embd->name, "result_embd_pooled") != 0) { - embd = gf->nodes[gf->n_nodes - 2]; + embd = ggml_graph_node(gf, -2); } GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); } @@ -18488,7 +18488,7 @@ struct llama_context * llama_new_context_with_model( // note: the number of splits during measure is higher than during inference due to the kv shift int n_splits = ggml_backend_sched_get_n_splits(ctx->sched); - LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes); + LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, ggml_graph_n_nodes(gf)); LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits); } } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 635de01d7..aa7896def 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -519,7 +519,7 @@ struct test_case { // add sentinels as graph nodes so that they are checked in the callback for (ggml_tensor * sentinel : sentinels) { - gf->nodes[gf->n_nodes++] = sentinel; + ggml_graph_add_node(gf, sentinel); } // randomize tensors @@ -679,9 +679,9 @@ struct test_case { // duplicate the op size_t target_size = ggml_backend_is_cpu(backend) ? 
1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU - int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1; + int n_runs = std::min((size_t) ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1; for (int i = 1; i < n_runs; i++) { - gf->nodes[gf->n_nodes++] = out; + ggml_graph_add_node(gf, out); } // calculate memory @@ -696,11 +696,11 @@ struct test_case { } return size; }; - for (int i = 0; i < gf->n_nodes; i++) { - if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) { + for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) { + if (ggml_is_view_op(ggml_graph_node(gf, i)->op) || ggml_graph_node(gf, i) == out) { continue; } - mem += tensor_op_size(gf->nodes[i]); + mem += tensor_op_size(ggml_graph_node(gf, i)); } // run @@ -804,7 +804,7 @@ struct test_case { ggml_graph_cpy(gf, gb); ggml_build_backward_expand(ctx, gf, gb, false); if (expect.size() != 1 || expect[0] != 0.0f) { - GGML_ASSERT(gb->n_nodes > gf->n_nodes); + GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf)); for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || t->grad->op != GGML_OP_NONE); } diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp index 1834c11d8..f47cc5299 100644 --- a/tests/test-grad0.cpp +++ b/tests/test-grad0.cpp @@ -242,12 +242,16 @@ static bool check_gradient( ggml_graph_cpy(gf, gb); ggml_build_backward_expand(ctx0, gf, gb, false); - ggml_graph_compute_with_ctx(ctx0, gf, n_threads); + ggml_graph_prepare(gf, n_threads, nullptr); + ggml_graph_work_init(gf, ctx0); + ggml_graph_compute(gf); ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_with_ctx(ctx0, gb, n_threads); + ggml_graph_prepare(gb, n_threads, nullptr); + ggml_graph_work_init(gb, ctx0); + ggml_graph_compute(gb); // ggml_graph_dump_dot(gf, NULL, "test-grad0-forward.dot"); // ggml_graph_dump_dot(gb, gf, "test-grad0-backward.dot"); @@ -262,13 +266,17 @@ static bool check_gradient( const float xp = x0 + eps; ggml_set_f32_1d(x[i], k, xp); - ggml_graph_compute_with_ctx(ctx0, gf, n_threads); + ggml_graph_prepare(gf, n_threads, nullptr); + ggml_graph_work_init(gf, ctx0); + ggml_graph_compute(gf); const double f0 = ggml_get_f32_1d(f, 0); ggml_set_f32_1d(x[i], k, xm); - ggml_graph_compute_with_ctx(ctx0, gf, n_threads); + ggml_graph_prepare(gf, n_threads, nullptr); + ggml_graph_work_init(gf, ctx0); + ggml_graph_compute(gf); const double f1 = ggml_get_f32_1d(f, 0); const double g0 = (f0 - f1)/(2.0*(double) eps); @@ -301,7 +309,9 @@ static bool check_gradient( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_with_ctx(ctx0, gb, n_threads); + ggml_graph_prepare(gb, n_threads, nullptr); + ggml_graph_work_init(gb, ctx0); + ggml_graph_compute(gb); const double g1 = ggml_get_f32_1d(x[i]->grad, k); diff --git a/tests/test-opt.cpp b/tests/test-opt.cpp index 546ca230b..8bc10da7f 100644 --- a/tests/test-opt.cpp +++ b/tests/test-opt.cpp @@ -113,7 +113,10 @@ int main(void) { ggml_build_forward_expand(ge, e); ggml_graph_reset(ge); - ggml_graph_compute_with_ctx(ctx, ge, /*n_threads*/ 1); + ggml_graph_prepare(ge, 1, nullptr); + ggml_graph_work_init(ge, nullptr); + ggml_graph_compute(ge); + ggml_graph_work_free(ge); const float fe = ggml_get_f32_1d(e, 0); printf("%s: e = %.4f\n", __func__, fe); @@ -124,7 +127,10 @@ int main(void) { ggml_graph_reset(ge); - ggml_graph_compute_with_ctx(ctx, ge, /*n_threads*/ 1); + ggml_graph_prepare(ge, 1, nullptr); + 
ggml_graph_work_init(ge, nullptr); + ggml_graph_compute(ge); + ggml_graph_work_free(ge); const float fe_opt = ggml_get_f32_1d(e, 0); printf("%s: original e = %.4f\n", __func__, fe); diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp index 246bb227d..6abcdf25c 100644 --- a/tests/test-rope.cpp +++ b/tests/test-rope.cpp @@ -112,17 +112,6 @@ static struct ggml_tensor * get_random_tensor_f32( return result; } -static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr); - - if (plan.work_size > 0) { - buf.resize(plan.work_size); - plan.work_data = buf.data(); - } - - ggml_graph_compute(graph, &plan); -} - int main(int /*argc*/, const char ** /*argv*/) { struct ggml_init_params params = { /* .mem_size = */ 128*1024*1024, @@ -130,8 +119,6 @@ int main(int /*argc*/, const char ** /*argv*/) { /* .no_alloc = */ false, }; - std::vector work_buffer; - struct ggml_context * ctx0 = ggml_init(params); struct ggml_tensor * x; @@ -175,7 +162,10 @@ int main(int /*argc*/, const char ** /*argv*/) { ggml_build_forward_expand(gf, r1); ggml_build_forward_expand(gf, r2); - ggml_graph_compute_helper(work_buffer, gf, 4); + ggml_graph_prepare(gf, 4, nullptr); + ggml_graph_work_init(gf, nullptr); + ggml_graph_compute(gf); + ggml_graph_work_free(gf); // check that r1 and r2 are the same {
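The test and example hunks above all follow the same migration: ggml_graph_plan()/ggml_cplan and ggml_graph_compute_with_ctx() are replaced by ggml_graph_prepare(), ggml_graph_work_init() and the single-argument ggml_graph_compute(). As a quick orientation aid, here is a standalone sketch of the "dynamic allocations" pattern documented in the new ggml.h comments; it is illustrative only (not part of the patch), and the tensor sizes and thread count are arbitrary:

#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };

    struct ggml_context * ctx = ggml_init(params);

    // build a trivial graph: c = a + b
    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    ggml_set_f32(a, 1.0f);
    ggml_set_f32(b, 2.0f);

    struct ggml_tensor * c = ggml_add(ctx, a, b);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);

    // previously: ggml_graph_plan() + a caller-managed cplan.work_data buffer
    ggml_graph_prepare  (gf, /*n_threads =*/ 4, /*threadpool =*/ NULL);
    ggml_graph_work_init(gf, NULL); // NULL ctx -> work buffer is heap-allocated

    ggml_graph_compute(gf); // can be called many times

    // the output is the last node of the graph
    struct ggml_tensor * out = ggml_graph_node(gf, -1);
    printf("out[0] = %f\n", ggml_get_f32_1d(out, 0));

    ggml_graph_work_free(gf); // required because work_init() was given a NULL context
    ggml_free(ctx);

    return 0;
}

Passing a ggml_context instead of NULL to ggml_graph_work_init() places the work buffer inside that context, in which case the ggml_graph_work_free() call can be dropped, as the test-grad0.cpp hunks above show.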