rewrite: no longer consider backward compatibility; plan and make_plan

mqy 2023-07-03 16:00:47 +08:00
parent a1e7c69228
commit b11ac01f6b
8 changed files with 404 additions and 165 deletions
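
At a glance, the commit removes the implicit `n_threads` field on `ggml_cgraph` and the old `ggml_graph_compute(ctx, &gf)` entry point, replacing them with an explicit plan object. The sketch below distills the calling pattern that every updated call site in this commit now follows; the thread count and variable names are illustrative:

// before: gf.n_threads = n_threads; ggml_graph_compute(ctx0, &gf);
// after: plan, allocate the work buffer if needed, compute, free.
const int n_threads = 4; // illustrative
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
if (plan.work_size > 0) {
    plan.work_data = malloc(plan.work_size);
    GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, &gf);
if (plan.work_data) {
    free(plan.work_data);
}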

View file

@ -1586,7 +1586,6 @@ int main(int argc, char ** argv) {
int n_past = 0;
ggml_cgraph gf = {};
gf.n_threads = 1;
get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);
@ -1595,7 +1594,18 @@ int main(int argc, char ** argv) {
struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);
ggml_build_forward_expand(&gf, e);
ggml_graph_compute(ctx0, &gf);
{
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
if (plan.work_size > 0) {
plan.work_data = malloc(plan.work_size);
GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, &gf);
if (plan.work_data) {
free(plan.work_data);
}
}
float error_before_opt = ggml_get_f32_1d(e, 0);
@ -1611,7 +1621,18 @@ int main(int argc, char ** argv) {
ggml_opt(ctx0, opt_params_lbfgs, e);
//
ggml_build_forward_expand(&gf, e);
ggml_graph_compute(ctx0, &gf);
{
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
if (plan.work_size > 0) {
plan.work_data = malloc(plan.work_size);
GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, &gf);
if (plan.work_data) {
free(plan.work_data);
}
}
float error_after_opt = ggml_get_f32_1d(e, 0);
@ -1659,13 +1680,23 @@ int main(int argc, char ** argv) {
struct ggml_context * ctx0 = ggml_init(params);
ggml_cgraph gf = {};
gf.n_threads = 1;
int n_past = 0;
struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);
ggml_build_forward_expand(&gf, logits);
ggml_graph_compute(ctx0, &gf);
{
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
if (plan.work_size > 0) {
plan.work_data = malloc(plan.work_size);
GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, &gf);
if (plan.work_data) {
free(plan.work_data);
}
}
struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);

View file

@ -159,13 +159,22 @@ int main(int argc, char ** argv) {
// printf("Creating compute graph\n");
struct ggml_cgraph gf = ggml_build_forward(m11xm2);
gf.n_threads=benchmark_params.n_threads;
printf("cgraph->n_threads=%i\n",gf.n_threads);
printf("n_threads=%i\n", benchmark_params.n_threads);
TENSOR_DUMP(m11);
TENSOR_DUMP(m2);
ggml_graph_compute(ctx, &gf);
{
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads);
if (plan.work_size > 0) {
plan.work_data = malloc(plan.work_size);
GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, &gf);
if (plan.work_data) {
free(plan.work_data);
}
}
TENSOR_DUMP(gf.nodes[0]);
@ -187,7 +196,6 @@ int main(int argc, char ** argv) {
// printf("Creating compute graph\n");
struct ggml_cgraph gf31 = ggml_build_forward(q31);
gf31.n_threads=benchmark_params.n_threads;
// Set up a second graph computation to make sure we override the CPU cache lines
// printf("Creating new tensor q12 & Running quantize\n");
@ -199,8 +207,7 @@ int main(int argc, char ** argv) {
//printf("Creating compute graph\n");
struct ggml_cgraph gf32 = ggml_build_forward(q32);
gf32.n_threads=benchmark_params.n_threads;
printf("cgraph->n_threads=%i\n",gf31.n_threads);
printf("n_threads=%i\n", benchmark_params.n_threads);
const int dimx = sizex;
const int dimy = sizey;
@ -221,14 +228,25 @@ int main(int argc, char ** argv) {
long long int start = ggml_time_us();
//printf("Running ggml_graph_compute\n");
ggml_graph_compute(ctx, &gf31);
{
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads);
if (plan.work_size > 0) {
plan.work_data = malloc(plan.work_size);
GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, &gf31);
if (plan.work_data) {
free(plan.work_data);
}
}
long long int stop = ggml_time_us();
long long int usec = stop-start;
double gflops = (double)(flops_per_matrix)/usec/1000.0;
gflops_sum += gflops;
printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
i,
gf31.n_threads,
benchmark_params.n_threads,
sizex, sizey, sizez, flops_per_matrix,
usec,gflops);
@ -253,7 +271,17 @@ int main(int argc, char ** argv) {
}
// Running a different graph computation to make sure we override the CPU cache lines
ggml_graph_compute(ctx, &gf32);
{
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads);
if (plan.work_size > 0) {
plan.work_data = malloc(plan.work_size);
GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, &gf32);
if (plan.work_data) {
free(plan.work_data);
}
}
}
printf("\n");
printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));

View file

@ -3215,9 +3215,6 @@ int main(int argc, char ** argv) {
struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
// ggml_cgraph gf = {};
gf->n_threads = params.n_threads;
gb->n_threads = params.n_threads;
get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs);
@ -3246,7 +3243,17 @@ int main(int argc, char ** argv) {
*gb = ggml_build_backward(ctx0, gf, true);
}
ggml_graph_compute(ctx0, gf);
{
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, params.n_threads);
if (plan.work_size > 0) {
plan.work_data = malloc(plan.work_size);
GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, gf);
if (plan.work_data) {
free(plan.work_data);
}
}
size_t used_mem_before_opt = ggml_used_mem(ctx0);
@ -3270,7 +3277,17 @@ int main(int argc, char ** argv) {
model.train_samples += n_batch;
model.train_tokens += n_batch * n_tokens;
ggml_graph_compute(ctx0, gf);
{
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, params.n_threads);
if (plan.work_size > 0) {
plan.work_data = malloc(plan.work_size);
GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, gf);
if (plan.work_data) {
free(plan.work_data);
}
}
float error_after_opt = ggml_get_f32_1d(loss, 0);
@ -3352,13 +3369,23 @@ int main(int argc, char ** argv) {
struct ggml_context * ctx0 = ggml_init(cparams);
ggml_cgraph gf = {};
gf.n_threads = params.n_threads;
int n_past = 0;
struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);
ggml_build_forward_expand(&gf, logits);
ggml_graph_compute(ctx0, &gf);
{
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, params.n_threads);
if (plan.work_size > 0) {
plan.work_data = malloc(plan.work_size);
GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, &gf);
if (plan.work_data) {
free(plan.work_data);
}
}
//struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
//struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);

ggml.c (227 changed lines)
View file

@ -4583,14 +4583,13 @@ struct ggml_tensor * ggml_new_tensor_impl(
/*.src0 =*/ NULL,
/*.src1 =*/ NULL,
/*.opt =*/ { NULL },
/*.n_tasks =*/ 0,
/*.perf_runs =*/ 0,
/*.perf_cycles =*/ 0,
/*.perf_time_us =*/ 0,
/*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
/*.name =*/ { 0 },
/*.extra =*/ NULL,
/*.pad =*/ { 0 },
/*.padding =*/ { 0 },
};
// TODO: this should not be needed as long as we don't rely on aligned SIMD loads
@ -15772,7 +15771,6 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
struct ggml_cgraph result = {
/*.n_nodes =*/ 0,
/*.n_leafs =*/ 0,
/*.n_threads =*/ GGML_DEFAULT_N_THREADS,
/*.nodes =*/ { NULL },
/*.grads =*/ { NULL },
/*.leafs =*/ { NULL },
@ -15944,7 +15942,7 @@ void clear_numa_thread_affinity(void) {}
struct ggml_compute_state_shared {
struct ggml_cgraph * cgraph;
struct ggml_cgraph_context * cgraph_ctx;
struct ggml_graph_compute_plan * cgraph_ctx;
int64_t perf_node_start_cycles;
int64_t perf_node_start_time_us;
@ -15974,7 +15972,9 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
static thread_ret_t ggml_graph_compute_thread(void * data) {
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
struct ggml_cgraph * cgraph = state->shared->cgraph;
struct ggml_cgraph_context * ctx = state->shared->cgraph_ctx;
struct ggml_graph_compute_plan * ctx = state->shared->cgraph_ctx;
const int *n_tasks_arr = ctx->n_tasks;
const int n_threads = state->shared->n_threads;
set_numa_thread_affinity(state->ith, n_threads);
@ -15997,7 +15997,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
/* FINALIZE */
struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
if (GGML_OP_HAS_FINALIZE[node->op]) {
params.nth = node->n_tasks;
params.nth = n_tasks_arr[node_n];
ggml_compute_forward(&params, node);
ggml_graph_compute_perf_stats_node(node, state->shared);
}
@ -16008,11 +16008,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
struct ggml_tensor * node = cgraph->nodes[node_n];
const int n_tasks = n_tasks_arr[node_n];
state->shared->perf_node_start_cycles = ggml_perf_cycles();
state->shared->perf_node_start_time_us = ggml_perf_time_us();
params.nth = node->n_tasks;
params.nth = n_tasks;
/* INIT */
if (GGML_OP_HAS_INIT[node->op]) {
@ -16020,7 +16021,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
ggml_compute_forward(&params, node);
}
if (node->n_tasks == 1) {
if (n_tasks == 1) {
// TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
// they do something more efficient than spinning (?)
params.type = GGML_TASK_COMPUTE;
@ -16052,16 +16053,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
/* COMPUTE */
struct ggml_tensor * node = cgraph->nodes[node_n];
const int n_tasks = n_tasks_arr[node_n];
struct ggml_compute_params params = {
/*.type =*/ GGML_TASK_COMPUTE,
/*.ith =*/ state->ith,
/*.nth =*/ node->n_tasks,
/*.nth =*/ n_tasks,
/*.wsize =*/ ctx->work_size,
/*.wdata =*/ ctx->work_data,
};
if (state->ith < node->n_tasks) {
if (state->ith < n_tasks) {
ggml_compute_forward(&params, node);
}
}
@ -16070,15 +16072,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
}
// Prepare for graph computing.
// Will set: node->n_tasks, ctx->{work_size, planned}
void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph) {
GGML_ASSERT(ctx);
// This function is actually reentrant, but duplicate calls is unnecessary.
GGML_ASSERT(ctx->work_size == 0);
GGML_ASSERT(ctx->work_data == NULL);
GGML_ASSERT(!ctx->planned);
struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * cgraph, int n_threads) {
if (n_threads <= 0) {
n_threads = GGML_DEFAULT_N_THREADS;
}
int n_threads = cgraph->n_threads;
struct ggml_graph_compute_plan ctx;
memset(&ctx, 0, sizeof(struct ggml_graph_compute_plan));
int * n_tasks = ctx.n_tasks;
size_t work_size = 0;
// initialize tasks + work buffer
@ -16091,11 +16092,11 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
case GGML_OP_CPY:
case GGML_OP_DUP:
{
node->n_tasks = n_threads;
n_tasks[i] = n_threads;
size_t cur = 0;
if (ggml_is_quantized(node->type)) {
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads;
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks[i];
}
work_size = MAX(work_size, cur);
@ -16103,24 +16104,24 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
case GGML_OP_ADD:
case GGML_OP_ADD1:
{
node->n_tasks = n_threads;
n_tasks[i] = n_threads;
size_t cur = 0;
if (ggml_is_quantized(node->src0->type)) {
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads;
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_tasks[i];
}
work_size = MAX(work_size, cur);
} break;
case GGML_OP_ACC:
{
node->n_tasks = n_threads;
n_tasks[i] = n_threads;
size_t cur = 0;
if (ggml_is_quantized(node->src0->type)) {
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_threads;
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_tasks[i];
}
work_size = MAX(work_size, cur);
@ -16144,7 +16145,7 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
case GGML_OP_ELU:
case GGML_OP_RELU:
{
node->n_tasks = 1;
n_tasks[i] = 1;
} break;
case GGML_OP_MUL:
case GGML_OP_GELU:
@ -16155,32 +16156,32 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
case GGML_OP_RMS_NORM:
case GGML_OP_RMS_NORM_BACK:
{
node->n_tasks = n_threads;
n_tasks[i] = n_threads;
} break;
case GGML_OP_MUL_MAT:
case GGML_OP_OUT_PROD:
{
node->n_tasks = n_threads;
n_tasks[i] = n_threads;
// TODO: use different scheduling for different matrix sizes
//const int nr0 = ggml_nrows(node->src0);
//const int nr1 = ggml_nrows(node->src1);
//node->n_tasks = MIN(n_threads, MAX(1, nr0/128));
//printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, node->n_tasks);
//n_tasks[i] = MIN(n_threads, MAX(1, nr0/128));
//printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, n_tasks[i]);
size_t cur = 0;
const enum ggml_type vec_dot_type = type_traits[node->src0->type].vec_dot_type;
#if defined(GGML_USE_CUBLAS)
if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
node->n_tasks = 1; // TODO: this actually is doing nothing
n_tasks[i] = 1; // TODO: this actually is doing nothing
// the threads are still spinning
}
else
#elif defined(GGML_USE_CLBLAST)
if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) {
node->n_tasks = 1; // TODO: this actually is doing nothing
n_tasks[i] = 1; // TODO: this actually is doing nothing
// the threads are still spinning
cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
}
@ -16188,7 +16189,7 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
#endif
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
node->n_tasks = 1; // TODO: this actually is doing nothing
n_tasks[i] = 1; // TODO: this actually is doing nothing
// the threads are still spinning
if (node->src0->type != GGML_TYPE_F32) {
// here we need memory just for single 2D matrix from src0
@ -16206,7 +16207,7 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
} break;
case GGML_OP_SCALE:
{
node->n_tasks = 1;
n_tasks[i] = 1;
} break;
case GGML_OP_SET:
case GGML_OP_CONT:
@ -16219,7 +16220,7 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
case GGML_OP_DIAG:
case GGML_OP_DIAG_MASK_ZERO:
{
node->n_tasks = 1;
n_tasks[i] = 1;
} break;
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
@ -16227,19 +16228,19 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
case GGML_OP_ROPE:
case GGML_OP_ROPE_BACK:
{
node->n_tasks = n_threads;
n_tasks[i] = n_threads;
} break;
case GGML_OP_ALIBI:
{
node->n_tasks = 1; //TODO
n_tasks[i] = 1; //TODO
} break;
case GGML_OP_CLAMP:
{
node->n_tasks = 1; //TODO
n_tasks[i] = 1; //TODO
} break;
case GGML_OP_CONV_1D:
{
node->n_tasks = n_threads;
n_tasks[i] = n_threads;
GGML_ASSERT(node->src0->ne[3] == 1);
GGML_ASSERT(node->src1->ne[2] == 1);
@ -16268,7 +16269,7 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
} break;
case GGML_OP_CONV_2D:
{
node->n_tasks = n_threads;
n_tasks[i] = n_threads;
GGML_ASSERT(node->src1->ne[3] == 1);
@ -16303,45 +16304,45 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
} break;
case GGML_OP_FLASH_ATTN:
{
node->n_tasks = n_threads;
n_tasks[i] = n_threads;
size_t cur = 0;
const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
if (node->src1->type == GGML_TYPE_F32) {
cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2
cur = sizeof(float)*ne11*n_tasks[i]; // TODO: this can become (n_tasks[i]-1)
cur += sizeof(float)*ne11*n_tasks[i]; // this is overestimated by x2
}
if (node->src1->type == GGML_TYPE_F16) {
cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2
cur = sizeof(float)*ne11*n_tasks[i]; // TODO: this can become (n_tasks[i]-1)
cur += sizeof(float)*ne11*n_tasks[i]; // this is overestimated by x2
}
work_size = MAX(work_size, cur);
} break;
case GGML_OP_FLASH_FF:
{
node->n_tasks = n_threads;
n_tasks[i] = n_threads;
size_t cur = 0;
if (node->src1->type == GGML_TYPE_F32) {
cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2
cur = sizeof(float)*node->src1->ne[1]*n_tasks[i]; // TODO: this can become (n_tasks[i]-1)
cur += sizeof(float)*node->src1->ne[1]*n_tasks[i]; // this is overestimated by x2
}
if (node->src1->type == GGML_TYPE_F16) {
cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2
cur = sizeof(float)*node->src1->ne[1]*n_tasks[i]; // TODO: this can become (n_tasks[i]-1)
cur += sizeof(float)*node->src1->ne[1]*n_tasks[i]; // this is overestimated by x2
}
work_size = MAX(work_size, cur);
} break;
case GGML_OP_FLASH_ATTN_BACK:
{
node->n_tasks = n_threads;
n_tasks[i] = n_threads;
size_t cur = 0;
@ -16349,13 +16350,13 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
if (node->src1->type == GGML_TYPE_F32) {
cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2
cur = sizeof(float)*mxDn*n_tasks[i]; // TODO: this can become (n_tasks[i]-1)
cur += sizeof(float)*mxDn*n_tasks[i]; // this is overestimated by x2
}
if (node->src1->type == GGML_TYPE_F16) {
cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2
cur = sizeof(float)*mxDn*n_tasks[i]; // TODO: this can become (n_tasks[i]-1)
cur += sizeof(float)*mxDn*n_tasks[i]; // this is overestimated by x2
}
work_size = MAX(work_size, cur);
@ -16368,27 +16369,27 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
case GGML_OP_MAP_CUSTOM2:
case GGML_OP_MAP_CUSTOM3:
{
node->n_tasks = 1;
n_tasks[i] = 1;
} break;
case GGML_OP_CROSS_ENTROPY_LOSS:
{
node->n_tasks = n_threads;
n_tasks[i] = n_threads;
size_t cur = ggml_type_size(node->type)*(node->n_tasks + node->src0->ne[0]*node->n_tasks);
size_t cur = ggml_type_size(node->type)*(n_tasks[i] + node->src0->ne[0]*n_tasks[i]);
work_size = MAX(work_size, cur);
} break;
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
{
node->n_tasks = n_threads;
n_tasks[i] = n_threads;
size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*node->n_tasks;
size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_tasks[i];
work_size = MAX(work_size, cur);
} break;
case GGML_OP_NONE:
{
node->n_tasks = 1;
n_tasks[i] = 1;
} break;
case GGML_OP_COUNT:
{
@ -16402,35 +16403,31 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
work_size += CACHE_LINE_SIZE*(n_threads - 1);
}
ctx->work_size = work_size;
ctx->work_data = NULL;
ctx->planned = true;
ctx.n_threads = n_threads;
ctx.work_size = work_size;
ctx.work_data = NULL;
return ctx;
}
void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph) {
if (ctx == NULL) {
ctx = alloca(sizeof(struct ggml_cgraph_context));
void ggml_graph_compute(struct ggml_graph_compute_plan * ctx, struct ggml_cgraph * cgraph) {
{
GGML_ASSERT(ctx);
ctx->work_size = 0;
ctx->work_data = NULL;
ctx->planned = false;
} else {
// The work_size and work_data MAY have default values even if has been planned.
GGML_ASSERT(ctx->n_threads > 0);
if (ctx->work_size > 0) {
GGML_ASSERT(ctx->work_data);
}
}
if (!ctx->planned) {
ggml_graph_compute_plan(ctx, cgraph);
if (ctx->work_size > 0) {
ctx->work_data = malloc(ctx->work_size * sizeof(GGML_TYPE_I8));
GGML_ASSERT(ctx->work_data);
GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, work_size);
for (int i = 0; i < cgraph->n_nodes; ++i) {
if (cgraph->nodes[i]->op != GGML_OP_NONE) {
GGML_ASSERT(ctx->n_tasks[i] > 0);
}
}
}
const int n_threads = cgraph->n_threads;
const int n_threads = ctx->n_threads;
struct ggml_compute_state_shared state_shared = {
/*.cgraph =*/ cgraph,
@ -16494,12 +16491,6 @@ void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph
}
}
// Deprecated, keep it only for backward compatibility.
void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
UNUSED(ctx);
ggml_graph_compute_v2(NULL, cgraph);
}
void ggml_graph_reset(struct ggml_cgraph * cgraph) {
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * grad = cgraph->grads[i];
@ -16548,14 +16539,13 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
const int64_t * ne = tensor->ne;
const size_t * nb = tensor->nb;
fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %8d %16p %32s\n",
fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
arg,
ggml_type_name(tensor->type),
ggml_op_name (tensor->op),
tensor->n_dims,
ne[0], ne[1], ne[2], ne[3],
nb[0], nb[1], nb[2], nb[3],
tensor->n_tasks,
tensor->data,
tensor->name);
}
@ -17283,7 +17273,6 @@ static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g
//
static enum ggml_opt_result ggml_opt_adam(
struct ggml_context * ctx,
struct ggml_opt_context * opt,
struct ggml_opt_params params,
struct ggml_tensor * f,
@ -17291,9 +17280,6 @@ static enum ggml_opt_result ggml_opt_adam(
struct ggml_cgraph * gb) {
GGML_ASSERT(ggml_is_scalar(f));
gf->n_threads = params.n_threads;
gb->n_threads = params.n_threads;
// these will store the parameters we want to optimize
struct ggml_tensor * ps[GGML_MAX_PARAMS];
@ -17340,7 +17326,18 @@ static enum ggml_opt_result ggml_opt_adam(
// compute the function value
ggml_graph_reset (gf);
ggml_set_f32 (f->grad, 1.0f);
ggml_graph_compute(ctx, gb);
{
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gb, params.n_threads);
if (plan.work_size > 0) {
plan.work_data = malloc(plan.work_size);
GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, gb);
if (plan.work_data) {
free(plan.work_data);
}
}
opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
opt->adam.fx_best = opt->adam.fx_prev;
@ -17420,7 +17417,18 @@ static enum ggml_opt_result ggml_opt_adam(
ggml_graph_reset (gf);
ggml_set_f32 (f->grad, 1.0f);
ggml_graph_compute(ctx, gb);
{
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gb, params.n_threads);
if (plan.work_size > 0) {
plan.work_data = malloc(plan.work_size);
GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, gb);
if (plan.work_data) {
free(plan.work_data);
}
}
const float fx = ggml_get_f32_1d(f, 0);
@ -17491,7 +17499,6 @@ struct ggml_lbfgs_iteration_data {
};
static enum ggml_opt_result linesearch_backtracking(
struct ggml_context * ctx,
const struct ggml_opt_params * params,
int nx,
float * x,
@ -17542,7 +17549,18 @@ static enum ggml_opt_result linesearch_backtracking(
ggml_graph_reset (gf);
ggml_set_f32 (f->grad, 1.0f);
ggml_graph_compute(ctx, gb);
{
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gb, params->n_threads);
if (plan.work_size > 0) {
plan.work_data = malloc(plan.work_size);
GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, gb);
if (plan.work_data) {
free(plan.work_data);
}
}
ggml_opt_get_grad(np, ps, g);
@ -17610,9 +17628,6 @@ static enum ggml_opt_result ggml_opt_lbfgs(
}
}
gf->n_threads = params.n_threads;
gb->n_threads = params.n_threads;
const int m = params.lbfgs.m;
// these will store the parameters we want to optimize
@ -17664,7 +17679,17 @@ static enum ggml_opt_result ggml_opt_lbfgs(
ggml_graph_reset (gf);
ggml_set_f32 (f->grad, 1.0f);
ggml_graph_compute(ctx, gb);
{
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gb, params.n_threads);
if (plan.work_size > 0) {
plan.work_data = malloc(plan.work_size);
GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, gb);
if (plan.work_data) {
free(plan.work_data);
}
}
ggml_opt_get_grad(np, ps, g);
@ -17723,7 +17748,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
ggml_vec_cpy_f32(nx, xp, x);
ggml_vec_cpy_f32(nx, gp, g);
ls = linesearch_backtracking(ctx, &params, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps);
ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps);
if (ls < 0) {
// linesearch failed - go back to the previous point and return
@ -18025,7 +18050,7 @@ enum ggml_opt_result ggml_opt_resume_g(
switch (opt->params.type) {
case GGML_OPT_ADAM:
{
result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb);
result = ggml_opt_adam(opt, opt->params, f, gf, gb);
} break;
case GGML_OPT_LBFGS:
{

ggml.h (52 changed lines)
View file

@ -65,7 +65,16 @@
// ggml_set_f32(a, 3.0f);
// ggml_set_f32(b, 4.0f);
//
// ggml_graph_compute(ctx0, &gf);
// const int n_threads = 1;
// struct ggml_graph_compute_plan ctx = ggml_graph_compute_make_plan(&gf, n_threads);
// if (ctx.work_size > 0) {
// ctx.work_data = malloc(ctx.work_size);
// GGML_ASSERT(ctx.work_data);
// }
// ggml_graph_compute(&ctx, &gf);
// if (ctx.work_data) {
// free(ctx.work_data);
// }
//
// printf("f = %f\n", ggml_get_f32_1d(f, 0));
//
@ -418,9 +427,6 @@ extern "C" {
struct ggml_tensor * src1;
struct ggml_tensor * opt[GGML_MAX_OPT];
// thread scheduling
int n_tasks;
// performance
int perf_runs;
int64_t perf_cycles;
@ -432,27 +438,30 @@ extern "C" {
void * extra; // extra things e.g. for ggml-cuda.cu
char padding[4];
char padding[8];
};
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
// graph compute context
struct ggml_cgraph_context {
// After call to `ggml_graph_compute_plan()`, `planned` is set as true,
// `work_size` will be updated as non-zero when buffer is required. When
// need buffer, caller MUST allocate memory for `work_data`.
// See https://github.com/ggerganov/ggml/issues/287
// The default graph compute plan that needs to be prepared for ggml_graph_compute().
// Since https://github.com/ggerganov/ggml/issues/287
struct ggml_graph_compute_plan {
// Size of work buffer, calculated by `ggml_graph_compute_make_plan()`.
size_t work_size;
// Work buffer.
// Expected to be allocated/freed by the caller before/after calling `ggml_graph_compute()`.
void * work_data;
bool planned; // true means ready to compute graph nodes.
int n_threads;
// Per-node `n_tasks`, mapping 1:1 to the cgraph nodes.
int n_tasks[GGML_MAX_NODES];
};
// computation graph
struct ggml_cgraph {
int n_nodes;
int n_leafs;
int n_threads;
struct ggml_tensor * nodes[GGML_MAX_NODES];
struct ggml_tensor * grads[GGML_MAX_NODES];
@ -1305,19 +1314,10 @@ extern "C" {
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
// Since https://github.com/ggerganov/ggml/issues/287
GGML_API void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph);
// Since https://github.com/ggerganov/ggml/issues/287
// When `ctx` is NULL, `ggml_graph_compute_v2()` calculates work_size and allocates memory for `work_data`.
// Another use case: allocate buffer explicitly:
// - call `ggml_graph_compute_plan()`;
// - allocate memory for `ctx->work_data`;
// - finally call `ggml_graph_compute_v2()`.
// NOTE: don't manually set `ctx->planned`.
GGML_API void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph);
// Deprecated, `ctx` is not required. Use `ggml_graph_compute_v2` instead.
// See https://github.com/ggerganov/ggml/issues/287
GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
// ggml_graph_compute_make_plan() needs to be called before ggml_graph_compute().
// Returns a plan object. When plan.work_size > 0, caller must allocate memory for plan.work_data.
GGML_API struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * cgraph, const int n_threads/*=GGML_DEFAULT_N_THREADS*/);
GGML_API void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgraph * cgraph);
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
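
Because the plan/allocate/compute/free sequence now appears verbatim at every call site in the examples, tests, and llama.cpp, a caller may want to fold it into a small wrapper. The helper below is only a sketch and is not part of this commit; the name ggml_graph_compute_with_plan is hypothetical:

#include <stdlib.h> // malloc, free
#include "ggml.h"

// Hypothetical convenience wrapper (not part of this commit): plan the graph,
// allocate the work buffer if one is required, compute, then free the buffer.
static void ggml_graph_compute_with_plan(struct ggml_cgraph * cgraph, int n_threads) {
    struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(cgraph, n_threads);
    if (plan.work_size > 0) {
        plan.work_data = malloc(plan.work_size);
        GGML_ASSERT(plan.work_data);
    }
    ggml_graph_compute(&plan, cgraph);
    if (plan.work_data) {
        free(plan.work_data);
    }
}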

View file

@ -1309,7 +1309,7 @@ static bool llama_eval_internal(
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ggml_cgraph gf = {};
gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
const int actual_n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
@ -1612,10 +1612,30 @@ static bool llama_eval_internal(
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
}
ggml_graph_compute(ctx0, &gf);
{
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
if (plan.work_size > 0) {
plan.work_data = malloc(plan.work_size);
GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, &gf);
if (plan.work_data) {
free(plan.work_data);
}
}
}
#else
ggml_graph_compute(ctx0, &gf);
{
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
if (plan.work_size > 0) {
plan.work_data = malloc(plan.work_size);
GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, &gf);
if (plan.work_data) {
free(plan.work_data);
}
}
#endif
if (cgraph_fname) {
@ -2966,8 +2986,18 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
}
struct ggml_cgraph gf = ggml_build_forward(r);
gf.n_threads = n_threads;
ggml_graph_compute(lora_ctx, &gf);
{
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
if (plan.work_size > 0) {
plan.work_data = malloc(plan.work_size);
GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, &gf);
if (plan.work_data) {
free(plan.work_data);
}
}
// we won't need these tensors again, reset the context to save memory
ggml_free(lora_ctx);
@ -3120,7 +3150,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
ggml_cgraph gf{};
gf.n_threads = 1;
ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
kout3d->data = out;
@ -3140,7 +3169,18 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
ggml_graph_compute(cpy_ctx, &gf);
{
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
if (plan.work_size > 0) {
plan.work_data = malloc(plan.work_size);
GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, &gf);
if (plan.work_data) {
free(plan.work_data);
}
}
ggml_free(cpy_ctx);
}
@ -3226,7 +3266,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
ggml_cgraph gf{};
gf.n_threads = 1;
ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
kin3d->data = (void *) inp;
@ -3246,7 +3285,18 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
ggml_graph_compute(cpy_ctx, &gf);
{
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
if (plan.work_size > 0) {
plan.work_data = malloc(plan.work_size);
GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, &gf);
if (plan.work_data) {
free(plan.work_data);
}
}
ggml_free(cpy_ctx);
}

View file

@ -215,15 +215,36 @@ bool check_gradient(
}
struct ggml_cgraph gf = ggml_build_forward (f);
gf.n_threads = n_threads;
struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
gb.n_threads = n_threads;
ggml_graph_compute(ctx0, &gf);
{
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
if (plan.work_size > 0) {
plan.work_data = malloc(plan.work_size);
GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, &gf);
if (plan.work_data) {
free(plan.work_data);
}
}
ggml_graph_reset (&gf);
ggml_set_f32 (f->grad, 1.0f);
ggml_graph_compute(ctx0, &gb);
{
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads);
if (plan.work_size > 0) {
plan.work_data = malloc(plan.work_size);
GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, &gb);
if (plan.work_data) {
free(plan.work_data);
}
}
// ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot");
// ggml_graph_dump_dot(&gb, &gf, "test-grad0-backward.dot");
@ -236,12 +257,34 @@ bool check_gradient(
const float xm = x0 - eps;
const float xp = x0 + eps;
set_element(x[i], k, xp);
ggml_graph_compute(ctx0, &gf);
{
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
if (plan.work_size > 0) {
plan.work_data = malloc(plan.work_size);
GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, &gf);
if (plan.work_data) {
free(plan.work_data);
}
}
const float f0 = ggml_get_f32_1d(f, 0);
set_element(x[i], k, xm);
ggml_graph_compute(ctx0, &gf);
{
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
if (plan.work_size > 0) {
plan.work_data = malloc(plan.work_size);
GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, &gf);
if (plan.work_data) {
free(plan.work_data);
}
}
const float f1 = ggml_get_f32_1d(f, 0);
@ -252,7 +295,18 @@ bool check_gradient(
// compute gradient using backward graph
ggml_graph_reset (&gf);
ggml_set_f32 (f->grad, 1.0f);
ggml_graph_compute(ctx0, &gb);
{
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads);
if (plan.work_size > 0) {
plan.work_data = malloc(plan.work_size);
GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, &gb);
if (plan.work_data) {
free(plan.work_data);
}
}
const float g1 = get_element(x[i]->grad, k);

View file

@ -140,7 +140,19 @@ int main(int argc, const char ** argv) {
struct ggml_cgraph ge = ggml_build_forward(e);
ggml_graph_reset (&ge);
ggml_graph_compute(ctx, &ge);
{
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1);
if (plan.work_size > 0) {
plan.work_data = malloc(plan.work_size);
GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, &ge);
if (plan.work_data) {
free(plan.work_data);
}
}
const float fe = ggml_get_f32_1d(e, 0);
printf("%s: e = %.4f\n", __func__, fe);
@ -149,7 +161,19 @@ int main(int argc, const char ** argv) {
ggml_opt(ctx, opt_params, e);
ggml_graph_reset (&ge);
ggml_graph_compute(ctx, &ge);
{
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1);
if (plan.work_size > 0) {
plan.work_data = malloc(plan.work_size);
GGML_ASSERT(plan.work_data);
}
ggml_graph_compute(&plan, &ge);
if (plan.work_data) {
free(plan.work_data);
}
}
const float fe_opt = ggml_get_f32_1d(e, 0);
printf("%s: original e = %.4f\n", __func__, fe);
printf("%s: optimized e = %.4f\n", __func__, fe_opt);