diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp
index 785e7e886..5d66089b1 100644
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -1569,7 +1569,7 @@ int main(int argc, char ** argv) {
     int n_tokens = model.hparams.n_ctx;
     int n_vocab  = model.hparams.n_vocab;
 
-    auto compute_plan_buffer = std::vector<uint8_t>();
+    std::vector<uint8_t> work_buffer;
 
     for (int ex=0; ex<n_examples; ++ex) {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
-            if (plan.work_size > 0) {
-                compute_plan_buffer.resize(plan.work_size);
-                plan.work_data = compute_plan_buffer.data();
+            struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1);
+            if (pf.work_size > 0) {
+                work_buffer.resize(pf.work_size);
+                pf.work_data = work_buffer.data();
             }
 
-            ggml_graph_compute(&plan, &gf);
+            ggml_graph_compute(&gf, &pf);
         }
 
         float error_before_opt = ggml_get_f32_1d(e, 0);
@@ -1622,12 +1622,12 @@ int main(int argc, char ** argv) {
         ggml_build_forward_expand(&gf, e);
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
-            if (plan.work_size > 0) {
-                compute_plan_buffer.resize(plan.work_size);
-                plan.work_data = compute_plan_buffer.data();
+            struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1);
+            if (pf.work_size > 0) {
+                work_buffer.resize(pf.work_size);
+                pf.work_data = work_buffer.data();
             }
 
-            ggml_graph_compute(&plan, &gf);
+            ggml_graph_compute(&gf, &pf);
         }
 
         float error_after_opt = ggml_get_f32_1d(e, 0);
@@ -1683,12 +1683,12 @@ int main(int argc, char ** argv) {
         ggml_build_forward_expand(&gf, logits);
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
-            if (plan.work_size > 0) {
-                compute_plan_buffer.resize(plan.work_size);
-                plan.work_data = compute_plan_buffer.data();
+            struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1);
+            if (pf.work_size > 0) {
+                work_buffer.resize(pf.work_size);
+                pf.work_data = work_buffer.data();
             }
 
-            ggml_graph_compute(&plan, &gf);
+            ggml_graph_compute(&gf, &pf);
         }
 
         struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
index e7d75c9ae..840f4fe52 100644
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -164,15 +164,15 @@ int main(int argc, char ** argv) {
    TENSOR_DUMP(m11);
    TENSOR_DUMP(m2);
 
-    auto compute_plan_buffer = std::vector<uint8_t>();
+    std::vector<uint8_t> work_buffer;
 
    {
-        auto plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads);
-        if (plan.work_size > 0) {
-            compute_plan_buffer.resize(plan.work_size);
-            plan.work_data = compute_plan_buffer.data();
+        ggml_cplan pf = ggml_graph_plan(&gf, benchmark_params.n_threads);
+        if (pf.work_size > 0) {
+            work_buffer.resize(pf.work_size);
+            pf.work_data = work_buffer.data();
        }
 
-        ggml_graph_compute(&plan, &gf);
+        ggml_graph_compute(&gf, &pf);
    }
 
    TENSOR_DUMP(gf.nodes[0]);
@@ -228,12 +228,12 @@ int main(int argc, char ** argv) {
        long long int start = ggml_time_us();
        //printf("Running ggml_graph_compute\n");
        {
-            auto plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads);
-            if (plan.work_size > 0) {
-                compute_plan_buffer.resize(plan.work_size);
-                plan.work_data = compute_plan_buffer.data();
+            ggml_cplan pf31 = ggml_graph_plan(&gf31, benchmark_params.n_threads);
+            if (pf31.work_size > 0) {
+                work_buffer.resize(pf31.work_size);
+                pf31.work_data = work_buffer.data();
            }
 
-            ggml_graph_compute(&plan, &gf31);
+            ggml_graph_compute(&gf31, &pf31);
        }
 
        long long int stop = ggml_time_us();
@@ -268,12 +268,12 @@ int main(int argc, char ** argv) {
        // Running a different graph computation to make sure we override the CPU cache lines
        {
-            auto plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads);
-            if (plan.work_size > 0) {
-                compute_plan_buffer.resize(plan.work_size);
-                plan.work_data = compute_plan_buffer.data();
+            ggml_cplan pf32 = ggml_graph_plan(&gf32, benchmark_params.n_threads);
+            if (pf32.work_size > 0) {
+                work_buffer.resize(pf32.work_size);
+                pf32.work_data = work_buffer.data();
            }
 
-            ggml_graph_compute(&plan, &gf32);
+            ggml_graph_compute(&gf32, &pf32);
        }
    }
    printf("\n");
diff --git a/examples/metal/metal.cpp b/examples/metal/metal.cpp
index cdfe4bfe9..7438defde 100644
--- a/examples/metal/metal.cpp
+++ b/examples/metal/metal.cpp
@@ -35,10 +35,9 @@ int main(int argc, char ** argv) {
    struct ggml_context * ctx_eval = NULL;
 
    struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
-    gf.n_threads = 1;
 
    // this allocates all Metal resources and memory buffers
-    auto * ctx_metal = ggml_metal_init();
+    auto * ctx_metal = ggml_metal_init(1);
 
    const size_t max_size_data = ggml_get_max_tensor_size(ctx_data);
    const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval);
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 0345b8dc0..11ffbe2e1 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -3160,6 +3160,7 @@ int main(int argc, char ** argv) {
    printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(model.ctx));
    // ggml_print_tensor_objects(model.ctx);
 
+    // TODO: use std::vector<uint8_t> intead of "new"
    size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb);
    uint8_t * compute_addr = new uint8_t[compute_size];
 
@@ -3181,7 +3182,7 @@ int main(int argc, char ** argv) {
        GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size());
    }
 
-    auto compute_plan_buffer = std::vector<uint8_t>();
+    std::vector<uint8_t> work_buffer;
 
    printf("%s: begin training\n", __func__);
 
@@ -3246,12 +3247,12 @@ int main(int argc, char ** argv) {
        }
 
        {
-            auto plan = ggml_graph_compute_make_plan(gf, params.n_threads);
-            if (plan.work_size > 0) {
-                compute_plan_buffer.resize(plan.work_size);
-                plan.work_data = compute_plan_buffer.data();
+            ggml_cplan pf = ggml_graph_plan(gf, params.n_threads);
+            if (pf.work_size > 0) {
+                work_buffer.resize(pf.work_size);
+                pf.work_data = work_buffer.data();
            }
 
-            ggml_graph_compute(&plan, gf);
+            ggml_graph_compute(gf, &pf);
        }
 
        size_t used_mem_before_opt = ggml_used_mem(ctx0);
@@ -3277,12 +3278,12 @@ int main(int argc, char ** argv) {
        model.train_tokens += n_batch * n_tokens;
 
        {
-            auto plan = ggml_graph_compute_make_plan(gf, params.n_threads);
-            if (plan.work_size > 0) {
-                compute_plan_buffer.resize(plan.work_size);
-                plan.work_data = compute_plan_buffer.data();
+            ggml_cplan pf = ggml_graph_plan(gf, params.n_threads);
+            if (pf.work_size > 0) {
+                work_buffer.resize(pf.work_size);
+                pf.work_data = work_buffer.data();
            }
 
-            ggml_graph_compute(&plan, gf);
+            ggml_graph_compute(gf, &pf);
        }
 
        float error_after_opt = ggml_get_f32_1d(loss, 0);
@@ -3372,12 +3373,12 @@ int main(int argc, char ** argv) {
        ggml_build_forward_expand(&gf, logits);
 
        {
-            auto plan = ggml_graph_compute_make_plan(&gf, params.n_threads);
-            if (plan.work_size > 0) {
-                compute_plan_buffer.resize(plan.work_size);
-                plan.work_data = compute_plan_buffer.data();
+            ggml_cplan pf = ggml_graph_plan(&gf, params.n_threads);
+            if (pf.work_size > 0) {
+                work_buffer.resize(pf.work_size);
+                pf.work_data = work_buffer.data();
            }
 
-            ggml_graph_compute(&plan, &gf);
+            ggml_graph_compute(&gf, &pf);
        }
 
        //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
@@ -3404,6 +3405,7 @@ int main(int argc, char ** argv) {
    delete[] compute_addr;
    delete[] compute_buf_0;
    delete[] compute_buf_1;
+    llama_free(lctx);
    llama_free_model(lmodel);
    ggml_free(model.ctx);
 
diff --git a/ggml-metal.h b/ggml-metal.h
index b9e50ac74..928f1705c 100644
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -34,9 +34,13 @@ extern "C" {
 
struct ggml_metal_context;
 
-struct ggml_metal_context * ggml_metal_init(void);
+// number of command buffers to use
+struct ggml_metal_context * ggml_metal_init(int n_cb);
void ggml_metal_free(struct ggml_metal_context * ctx);
 
+// set the number of command buffers to use
+void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
+
// creates a mapping between a host memory buffer and a device memory buffer
// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
// - the mapping is used during computation to determine the arguments of the compute kernels
diff --git a/ggml-metal.m b/ggml-metal.m
index fd69c41fe..3f15f791f 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -25,6 +25,8 @@ struct ggml_metal_buffer {
};
 
struct ggml_metal_context {
+    int n_cb;
+
    float * logits;
 
    id<MTLDevice> device;
@@ -86,11 +88,12 @@ static NSString * const msl_library_source = @"see metal.metal";
@implementation GGMLMetalClass
@end
 
-struct ggml_metal_context * ggml_metal_init(void) {
+struct ggml_metal_context * ggml_metal_init(int n_cb) {
    fprintf(stderr, "%s: allocating\n", __func__);
 
    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
 
+    ctx->n_cb = n_cb;
    ctx->device = MTLCreateSystemDefaultDevice();
    ctx->queue = [ctx->device newCommandQueue];
    ctx->n_buffers = 0;
@@ -208,6 +211,10 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    free(ctx);
}
 
+void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
+    ctx->n_cb = n_cb;
+}
+
// finds the Metal buffer that contains the tensor data on the GPU device
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
// Metal buffer based on the host memory pointer
@@ -354,7 +361,7 @@ void ggml_metal_graph_compute(
    // create multiple command buffers and enqueue them
    // then, we encode the graph into the command buffers in parallel
 
-    const int n_cb = gf->n_threads;
+    const int n_cb = ctx->n_cb;
 
    NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
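With this change the Metal backend no longer reads `gf.n_threads`; callers choose the number of command buffers explicitly. A rough sketch of the updated host-side call sequence (assuming a graph `gf` whose data buffers have already been mapped to the device as described in the ggml-metal.h comments above; the variable names and the value passed to `ggml_metal_set_n_cb` are illustrative, not taken from the patch):

```cpp
// sketch only: exercise the new n_cb API of the Metal backend (illustrative values)
struct ggml_metal_context * ctx_metal = ggml_metal_init(1);   // start with a single command buffer

// ... map all tensor buffers used by `gf` to the device here ...

ggml_metal_set_n_cb     (ctx_metal, n_cb);   // e.g. the CPU thread count, as llama_eval_internal() now does
ggml_metal_graph_compute(ctx_metal, &gf);    // encode the graph into n_cb command buffers and run it

ggml_metal_free(ctx_metal);
```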
diff --git a/ggml.c b/ggml.c
index 94a710706..23938fc5f 100644
--- a/ggml.c
+++ b/ggml.c
@@ -15942,7 +15942,7 @@ void clear_numa_thread_affinity(void) {}
 
struct ggml_compute_state_shared {
    const struct ggml_cgraph * cgraph;
-    const struct ggml_graph_compute_plan * plan;
+    const struct ggml_cplan * cplan;
 
    int64_t perf_node_start_cycles;
    int64_t perf_node_start_time_us;
@@ -15971,12 +15971,13 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
static thread_ret_t ggml_graph_compute_thread(void * data) {
    struct ggml_compute_state * state = (struct ggml_compute_state *) data;
 
+    const struct ggml_cgraph * cgraph = state->shared->cgraph;
+    const struct ggml_cplan  * cplan  = state->shared->cplan;
 
-    const struct ggml_graph_compute_plan * plan = state->shared->plan;
-    const int * n_tasks_arr = plan->n_tasks;
+    const int * n_tasks_arr = cplan->n_tasks;
+    const int   n_threads   = state->shared->n_threads;
 
-    const int n_threads = state->shared->n_threads;
    set_numa_thread_affinity(state->ith, n_threads);
 
    int node_n = -1;
@@ -15989,8 +15990,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                /*.type =*/ GGML_TASK_FINALIZE,
                /*.ith =*/ 0,
                /*.nth =*/ 0,
-                /*.wsize =*/ plan->work_size,
-                /*.wdata =*/ plan->work_data,
+                /*.wsize =*/ cplan->work_size,
+                /*.wdata =*/ cplan->work_data,
            };
 
            if (node_n != -1) {
@@ -16059,8 +16060,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
            /*.type =*/ GGML_TASK_COMPUTE,
            /*.ith =*/ state->ith,
            /*.nth =*/ n_tasks,
-            /*.wsize =*/ plan->work_size,
-            /*.wdata =*/ plan->work_data,
+            /*.wsize =*/ cplan->work_size,
+            /*.wdata =*/ cplan->work_data,
        };
 
        if (state->ith < n_tasks) {
@@ -16072,14 +16073,16 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
}
 
// Prepare for graph computing.
-struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * cgraph, int n_threads) {
+struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
    if (n_threads <= 0) {
        n_threads = GGML_DEFAULT_N_THREADS;
    }
 
-    struct ggml_graph_compute_plan plan;
-    memset(&plan, 0, sizeof(struct ggml_graph_compute_plan));
-    int * n_tasks = plan.n_tasks;
+    struct ggml_cplan cplan;
+    memset(&cplan, 0, sizeof(struct ggml_cplan));
+
+    int * n_tasks = cplan.n_tasks;
+    size_t work_size = 0;
 
    // initialize tasks + work buffer
@@ -16403,34 +16406,34 @@ struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph *
        work_size += CACHE_LINE_SIZE*(n_threads - 1);
    }
 
-    plan.n_threads = n_threads;
-    plan.work_size = work_size;
-    plan.work_data = NULL;
+    cplan.n_threads = n_threads;
+    cplan.work_size = work_size;
+    cplan.work_data = NULL;
 
-    return plan;
+    return cplan;
}
 
-void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgraph * cgraph) {
+void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
    {
-        GGML_ASSERT(plan);
-        GGML_ASSERT(plan->n_threads > 0);
+        GGML_ASSERT(cplan);
+        GGML_ASSERT(cplan->n_threads > 0);
 
-        if (plan->work_size > 0) {
-            GGML_ASSERT(plan->work_data);
+        if (cplan->work_size > 0) {
+            GGML_ASSERT(cplan->work_data);
        }
 
        for (int i = 0; i < cgraph->n_nodes; ++i) {
            if (cgraph->nodes[i]->op != GGML_OP_NONE) {
-                GGML_ASSERT(plan->n_tasks[i] > 0);
+                GGML_ASSERT(cplan->n_tasks[i] > 0);
            }
        }
    }
 
-    const int n_threads = plan->n_threads;
+    const int n_threads = cplan->n_threads;
 
    struct ggml_compute_state_shared state_shared = {
        /*.cgraph =*/ cgraph,
-        /*.cgraph_plan =*/ plan,
+        /*.cgraph_plan =*/ cplan,
        /*.perf_node_start_cycles =*/ 0,
        /*.perf_node_start_time_us =*/ 0,
        /*.n_threads =*/ n_threads,
@@ -16491,17 +16494,19 @@ void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgrap
}
 
// TODO: avoid allocating memory frequently.
-static void ggml_graph_compute_sugar(struct ggml_cgraph * cgraph, int n_threads) {
-    struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(cgraph, n_threads);
-    if (plan.work_size > 0) {
-        plan.work_data = malloc(plan.work_size);
-        GGML_ASSERT(plan.work_data);
+// TODO: make part of public API - use different name and put warning that it makes allocations
+static void ggml_graph_compute_helper(struct ggml_cgraph * cgraph, int n_threads) {
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
+
+    if (cplan.work_size > 0) {
+        cplan.work_data = malloc(cplan.work_size);
+        GGML_ASSERT(cplan.work_data);
    }
 
-    ggml_graph_compute(&plan, cgraph);
+    ggml_graph_compute(cgraph, &cplan);
 
-    if (plan.work_data) {
-        free(plan.work_data);
+    if (cplan.work_data) {
+        free(cplan.work_data);
    }
}
 
@@ -17341,7 +17346,7 @@ static enum ggml_opt_result ggml_opt_adam(
        ggml_graph_reset (gf);
        ggml_set_f32     (f->grad, 1.0f);
 
-        ggml_graph_compute_sugar(gb, params.n_threads);
+        ggml_graph_compute_helper(gb, params.n_threads);
 
        opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
        opt->adam.fx_best = opt->adam.fx_prev;
@@ -17422,7 +17427,7 @@ static enum ggml_opt_result ggml_opt_adam(
        ggml_graph_reset (gf);
        ggml_set_f32     (f->grad, 1.0f);
 
-        ggml_graph_compute_sugar(gb, params.n_threads);
+        ggml_graph_compute_helper(gb, params.n_threads);
 
        const float fx = ggml_get_f32_1d(f, 0);
 
@@ -17544,7 +17549,7 @@ static enum ggml_opt_result linesearch_backtracking(
        ggml_graph_reset (gf);
        ggml_set_f32     (f->grad, 1.0f);
 
-        ggml_graph_compute_sugar(gb, params->n_threads);
+        ggml_graph_compute_helper(gb, params->n_threads);
 
        ggml_opt_get_grad(np, ps, g);
 
@@ -17664,7 +17669,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
        ggml_graph_reset (gf);
        ggml_set_f32     (f->grad, 1.0f);
 
-        ggml_graph_compute_sugar(gb, params.n_threads);
+        ggml_graph_compute_helper(gb, params.n_threads);
 
        ggml_opt_get_grad(np, ps, g);
 
diff --git a/ggml.h b/ggml.h
index 1b50ab866..901c701ea 100644
--- a/ggml.h
+++ b/ggml.h
@@ -443,17 +443,15 @@ extern "C" {
 
    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
-    // The default graph compute plan that needs to be prepared for ggml_graph_compute().
-    // Since https://github.com/ggerganov/ggml/issues/287
-    struct ggml_graph_compute_plan {
-        // Size of work buffer, calculated by `ggml_graph_compute_make_plan()`.
-        size_t work_size;
-        // Work buffer, to be allocated by caller before calling to `ggml_graph_compute()`.
-        uint8_t * work_data;
+    // the compute plan that needs to be prepared for ggml_graph_compute()
+    // since https://github.com/ggerganov/ggml/issues/287
+    struct ggml_cplan {
+        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
+        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
 
        int n_threads;
 
-        // The `n_tasks` of nodes, 1:1 mapping to cgraph nodes.
+        // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
        int n_tasks[GGML_MAX_NODES];
    };
 
@@ -1313,11 +1311,11 @@ extern "C" {
    GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
    GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
 
-    // ggml_graph_compute_make_plan() needs to be called before ggml_graph_compute().
-    // Returns a plan object. When plan.work_size > 0, caller must allocate memory for plan.work_data.
-    GGML_API struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * cgraph, const int n_threads/*=GGML_DEFAULT_N_THREADS*/);
-    GGML_API void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgraph * cgraph);
-    GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
+    // ggml_graph_plan() has to be called before ggml_graph_compute()
+    // when plan.work_size > 0, caller must allocate memory for plan.work_data
+    GGML_API struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+    GGML_API void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph);
 
    GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
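For callers migrating from `ggml_graph_compute_make_plan()`, the renamed API above is used in two steps: `ggml_graph_plan()` reports `work_size`, the caller allocates `work_data`, and only then calls `ggml_graph_compute()`. A minimal self-contained sketch of that pattern (the toy graph, buffer type, and thread count are illustrative, not taken from the patch):

```cpp
#include "ggml.h"

#include <cstdio>
#include <vector>

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };

    struct ggml_context * ctx = ggml_init(params);

    // f = a*b, element-wise
    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * f = ggml_mul(ctx, a, b);

    ggml_set_f32(a, 2.0f);
    ggml_set_f32(b, 3.0f);

    struct ggml_cgraph gf = ggml_build_forward(f);

    // 1) plan: reports how much scratch memory the graph needs
    struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 4);

    // 2) the caller owns the work buffer (same role as the work_buffer members above)
    std::vector<uint8_t> work_buffer;
    if (pf.work_size > 0) {
        work_buffer.resize(pf.work_size);
        pf.work_data = work_buffer.data();
    }

    // 3) compute with the caller-provided plan
    ggml_graph_compute(&gf, &pf);

    printf("f[0] = %f\n", (double) ggml_get_f32_1d(f, 0));

    ggml_free(ctx);

    return 0;
}
```

The static helper added to ggml.c performs the same three steps with `malloc`/`free`, which is why its TODO comment warns that it allocates on every call.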
diff --git a/llama.cpp b/llama.cpp
index c29d46d8d..e68beb7c5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -321,9 +321,8 @@ struct llama_context {
    // input embedding (1-dimensional array: [n_embd])
    std::vector<float> embedding;
 
-    // reusable buffer for `struct ggml_graph_compute_plan.work_data`
-    // std::vector guarantees the elements are stored contiguously.
-    std::vector<uint8_t> compute_plan_buffer;
+    // reusable buffer for `struct ggml_graph_plan.work_data`
+    std::vector<uint8_t> work_buffer;
 
    // memory buffers used to evaluate the model
    // TODO: move in llama_state
@@ -1599,6 +1598,7 @@ static bool llama_eval_internal(
 
#ifdef GGML_USE_METAL
    if (lctx.ctx_metal && N == 1) {
+        ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
        ggml_metal_graph_compute(lctx.ctx_metal, &gf);
        ggml_metal_get_tensor   (lctx.ctx_metal, cur);
        call_ggml_graph_compute = false;
@@ -1622,12 +1622,12 @@ static bool llama_eval_internal(
#endif
 
    if (call_ggml_graph_compute) {
-        auto plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
-        if (plan.work_size > 0) {
-            lctx.compute_plan_buffer.resize(plan.work_size);
-            plan.work_data = lctx.compute_plan_buffer.data();
+        ggml_cplan pf = ggml_graph_plan(&gf, actual_n_threads);
+        if (pf.work_size > 0) {
+            lctx.work_buffer.resize(pf.work_size);
+            pf.work_data = lctx.work_buffer.data();
        }
 
-        ggml_graph_compute(&plan, &gf);
+        ggml_graph_compute(&gf, &pf);
    }
 
    if (cgraph_fname) {
@@ -2587,8 +2587,8 @@ void llama_free_model(struct llama_model * model) {
}
 
struct llama_context * llama_new_context_with_model(
-                        struct llama_model * model,
-                        struct llama_context_params params) {
+        struct llama_model * model,
+        struct llama_context_params params) {
 
    if (!model) {
        return nullptr;
@@ -2657,7 +2657,7 @@ struct llama_context * llama_new_context_with_model(
#ifdef GGML_USE_METAL
    if (params.n_gpu_layers > 0) {
        // this allocates all Metal resources and memory buffers
-        ctx->ctx_metal = ggml_metal_init();
+        ctx->ctx_metal = ggml_metal_init(1);
 
        void * data_ptr = NULL;
        size_t data_size = 0;
@@ -2815,7 +2815,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
    bool warned = false;
    int n_tensors = 0;
 
-    auto compute_plan_buffer = std::vector<uint8_t>();
+    std::vector<uint8_t> work_buffer;
 
    while (true) {
        int32_t n_dims;
@@ -2983,12 +2983,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
            struct ggml_cgraph gf = ggml_build_forward(r);
 
            {
-                auto plan = ggml_graph_compute_make_plan(&gf, n_threads);
-                if (plan.work_size > 0) {
-                    compute_plan_buffer.resize(plan.work_size);
-                    plan.work_data = compute_plan_buffer.data();
+                ggml_cplan pf = ggml_graph_plan(&gf, n_threads);
+                if (pf.work_size > 0) {
+                    work_buffer.resize(pf.work_size);
+                    pf.work_data = work_buffer.data();
                }
 
-                ggml_graph_compute(&plan, &gf);
+                ggml_graph_compute(&gf, &pf);
            }
 
            // we won't need these tensors again, reset the context to save memory
@@ -3163,12 +3163,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
 
            {
-                auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
-                if (plan.work_size > 0) {
-                    ctx->compute_plan_buffer.resize(plan.work_size);
-                    plan.work_data = ctx->compute_plan_buffer.data();
+                ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1);
+                if (pf.work_size > 0) {
+                    ctx->work_buffer.resize(pf.work_size);
+                    pf.work_data = ctx->work_buffer.data();
                }
 
-                ggml_graph_compute(&plan, &gf);
+                ggml_graph_compute(&gf, &pf);
            }
 
            ggml_free(cpy_ctx);
@@ -3276,12 +3276,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
            ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
 
            {
-                auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
-                if (plan.work_size > 0) {
-                    ctx->compute_plan_buffer.resize(plan.work_size);
-                    plan.work_data = ctx->compute_plan_buffer.data();
+                ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1);
+                if (pf.work_size > 0) {
+                    ctx->work_buffer.resize(pf.work_size);
+                    pf.work_data = ctx->work_buffer.data();
                }
 
-                ggml_graph_compute(&plan, &gf);
+                ggml_graph_compute(&gf, &pf);
            }
 
            ggml_free(cpy_ctx);
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 4171c126c..dd989c5c0 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -10,5 +10,5 @@ llama_add_test(test-quantize-fns.cpp)
llama_add_test(test-quantize-perf.cpp)
llama_add_test(test-sampling.cpp)
llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
-# llama_add_test(test-grad0.c) # SLOW
-# llama_add_test(test-opt.c) # SLOW
+llama_add_test(test-grad0.c) # SLOW
+llama_add_test(test-opt.c) # SLOW
diff --git a/tests/test-grad0.c b/tests/test-grad0.c
index 548547727..9c27e603e 100644
--- a/tests/test-grad0.c
+++ b/tests/test-grad0.c
@@ -10,6 +10,8 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
 
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
+
#define MAX_NARGS 3
 
#undef MIN
@@ -49,7 +51,7 @@ float frand(void) {
 
int irand(int n) {
    if (n == 0) return 0;
-    else return rand()%n;
+    return rand()%n;
}
 
void get_random_dims(int64_t * dims, int ndims) {
@@ -159,12 +161,14 @@ struct ggml_tensor * get_random_tensor_int(
 
float get_element(const struct ggml_tensor * t, int idx) {
    if (t->type == GGML_TYPE_F32) {
        return ((float *)t->data)[idx];
-    } else if (t->type == GGML_TYPE_I32) {
-        return ((int32_t *)t->data)[idx];
-    } else {
-        assert(false);
-        return INFINITY;
    }
+
+    if (t->type == GGML_TYPE_I32) {
+        return ((int32_t *)t->data)[idx];
+    }
+
+    assert(false);
+    return INFINITY;
}
 
void set_element(struct ggml_tensor * t, int idx, float value) {
@@ -191,12 +195,12 @@ void print_elements(const char* label, const struct ggml_tensor * t) {
 
}
 
-struct compute_plan_buffer {
+struct work_buffer {
    size_t size;
    uint8_t * data;
};
 
-static uint8_t * ensure_plan_work_data(struct compute_plan_buffer *buf, size_t size) {
+static uint8_t * work_buffer_resize(struct work_buffer * buf, size_t size) {
    if (size == 0) {
        return NULL;
    }
@@ -241,20 +245,19 @@ bool check_gradient(
    }
 
    struct ggml_cgraph gf = ggml_build_forward (f);
-
    struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
 
-    struct compute_plan_buffer plan_buf = { /*.size = */ 0, /*.data =*/ NULL };
+    struct work_buffer buf = { /*.size = */ 0, /*.data =*/ NULL };
 
    {
-        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
-        if (plan.work_size > 0) {
-            plan.work_data = malloc(plan.work_size);
-            GGML_ASSERT(plan.work_data);
+        struct ggml_cplan pf = ggml_graph_plan(&gf, n_threads);
+        if (pf.work_size > 0) {
+            pf.work_data = malloc(pf.work_size);
+            GGML_ASSERT(pf.work_data);
        }
 
-        ggml_graph_compute(&plan, &gf);
-        if (plan.work_data) {
-            free(plan.work_data);
+        ggml_graph_compute(&gf, &pf);
+        if (pf.work_data) {
+            free(pf.work_data);
        }
    }
 
@@ -262,9 +265,9 @@ bool check_gradient(
    ggml_set_f32 (f->grad, 1.0f);
 
    {
-        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads);
-        plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
-        ggml_graph_compute(&plan, &gb);
+        struct ggml_cplan pf = ggml_graph_plan(&gb, n_threads);
+        pf.work_data = work_buffer_resize(&buf, pf.work_size);
+        ggml_graph_compute(&gf, &pf);
    }
 
    // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot");
@@ -280,9 +283,9 @@ bool check_gradient(
            set_element(x[i], k, xp);
 
            {
-                struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
-                plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
-                ggml_graph_compute(&plan, &gf);
+                struct ggml_cplan pf = ggml_graph_plan(&gf, n_threads);
+                pf.work_data = work_buffer_resize(&buf, pf.work_size);
+                ggml_graph_compute(&gf, &pf);
            }
 
            const float f0 = ggml_get_f32_1d(f, 0);
@@ -290,9 +293,9 @@ bool check_gradient(
            set_element(x[i], k, xm);
 
            {
-                struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
-                plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
-                ggml_graph_compute(&plan, &gf);
+                struct ggml_cplan pf = ggml_graph_plan(&gf, n_threads);
+                pf.work_data = work_buffer_resize(&buf, pf.work_size);
+                ggml_graph_compute(&gf, &pf);
            }
 
            const float f1 = ggml_get_f32_1d(f, 0);
@@ -306,15 +309,15 @@ bool check_gradient(
            ggml_set_f32 (f->grad, 1.0f);
 
            {
-                struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads);
-                plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
-                ggml_graph_compute(&plan, &gb);
+                struct ggml_cplan pf = ggml_graph_plan(&gb, n_threads);
+                pf.work_data = work_buffer_resize(&buf, pf.work_size);
+                ggml_graph_compute(&gf, &pf);
            }
 
            const float g1 = get_element(x[i]->grad, k);
 
            const float error_abs = fabsf(g0 - g1);
-            const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabs(g0) : 0;
+            const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0;
 
            if (error_abs > max_error_abs || error_rel > max_error_rel) {
                printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
@@ -325,8 +328,8 @@ bool check_gradient(
        }
    }
 
-    if (plan_buf.data) {
-        free(plan_buf.data);
+    if (buf.data) {
+        free(buf.data);
    }
 
    return true;
diff --git a/tests/test-opt.c b/tests/test-opt.c
index 35d070dc7..3ed246b3b 100644
--- a/tests/test-opt.c
+++ b/tests/test-opt.c
@@ -7,6 +7,7 @@
 
#define MAX_NARGS 2
 
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
 
//
// logging
@@ -33,7 +34,7 @@
#define GGML_PRINT(...) printf(__VA_ARGS__)
 
-float frand() {
+float frand(void) {
    return (float)rand()/(float)RAND_MAX;
}
 
@@ -115,12 +116,12 @@ void set_element(struct ggml_tensor * t, int idx, float value) {
}
 
-struct compute_plan_buffer {
+struct work_buffer {
    size_t size;
    uint8_t * data;
};
 
-static uint8_t * ensure_plan_work_data(struct compute_plan_buffer *buf, size_t size) {
+static uint8_t * work_buffer_resize(struct work_buffer * buf, size_t size) {
    if (size == 0) {
        return NULL;
    }
@@ -139,7 +140,7 @@ static uint8_t * ensure_plan_work_data(struct compute_plan_buffer *buf, size_t s
    return buf->data;
}
 
-int main(int argc, const char ** argv) {
+int main(void) {
    struct ggml_init_params params = {
        .mem_size   = 1024*1024*1024,
        .mem_buffer = NULL,
    };
@@ -166,11 +167,11 @@ int main(int argc, const char ** argv) {
    struct ggml_cgraph ge = ggml_build_forward(e);
    ggml_graph_reset (&ge);
 
-    struct compute_plan_buffer plan_buf = { /*.size = */ 0, /*.data =*/ NULL };
+    struct work_buffer buf = { /*.size = */ 0, /*.data =*/ NULL };
    {
-        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1);
-        plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
-        ggml_graph_compute(&plan, &ge);
+        struct ggml_cplan pe = ggml_graph_plan(&ge, /*n_threads*/ 1);
+        pe.work_data = work_buffer_resize(&buf, pe.work_size);
+        ggml_graph_compute(&ge, &pe);
    }
 
    const float fe = ggml_get_f32_1d(e, 0);
@@ -183,13 +184,13 @@ int main(int argc, const char ** argv) {
    ggml_graph_reset (&ge);
 
    {
-        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1);
-        plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
-        ggml_graph_compute(&plan, &ge);
+        struct ggml_cplan pe = ggml_graph_plan(&ge, /*n_threads*/ 1);
+        pe.work_data = work_buffer_resize(&buf, pe.work_size);
+        ggml_graph_compute(&ge, &pe);
    }
 
-    if (plan_buf.data) {
-        free(plan_buf.data);
+    if (buf.data) {
+        free(buf.data);
    }
 
    const float fe_opt = ggml_get_f32_1d(e, 0);