ggml : remove ggml_cplan + rework ggml_cgraph

ggml-ci
Georgi Gerganov 2024-09-11 13:05:10 +03:00
parent ee154457dd
commit 119e0bc9ae
GPG key ID: 449E073F9DC10735
10 changed files with 248 additions and 175 deletions

@@ -644,20 +644,6 @@ extern "C" {
typedef struct ggml_threadpool * ggml_threadpool_t;
// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan {
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
int n_threads;
struct ggml_threadpool * threadpool;
// abort ggml_graph_compute when true
ggml_abort_callback abort_callback;
void * abort_callback_data;
};
// scratch buffer
struct ggml_scratch {
size_t offs;
@@ -2047,7 +2033,6 @@ extern "C" {
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
// graph allocation in a context
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
@@ -2065,26 +2050,72 @@ extern "C" {
GGML_API size_t ggml_graph_overhead(void);
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
// TODO: move these declarations above, before the ggml_graph API, and reorder the implementations in ggml.c accordingly
// (unless the code has been moved to a separate source file)
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
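For context, the reworked API lets a persistent threadpool be attached directly to the graph. A rough sketch (assumes an already built `cgraph`, omits error checking, and uses the ggml_graph_prepare()/ggml_graph_work_init() calls declared further below):

    struct ggml_threadpool_params tpp = ggml_threadpool_params_default(8);
    struct ggml_threadpool * tp = ggml_threadpool_new(&tpp);

    ggml_graph_prepare  (cgraph, 8, tp);   // the same pool is reused for every compute of this graph
    ggml_graph_work_init(cgraph, NULL);

    ggml_graph_compute(cgraph);            // can be called repeatedly

    ggml_graph_work_free(cgraph);
    ggml_threadpool_free(tp);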
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan(
const struct ggml_cgraph * cgraph,
int n_threads, /* = GGML_DEFAULT_N_THREADS */
struct ggml_threadpool * threadpool /* = NULL */ );
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
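For contrast, the workflow removed here made the caller carry a separate ggml_cplan and its work buffer. A sketch of the old pattern (`cgraph` and `n_threads` assumed, error handling omitted):

    struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL);

    if (cplan.work_size > 0) {
        cplan.work_data = malloc(cplan.work_size); // the caller owns this buffer
    }

    ggml_graph_compute(cgraph, &cplan);

    free(cplan.work_data);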
// =================================================================================================
// CPU-only API for ggml_cgraph
//
// TODO: move as a separate backend
// NOTE: avoid using, will be removed
//
// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must ensure that the context has enough memory for the work data
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
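In other words, this wrapper collapses the prepare/init/compute sequence into a single call. A sketch, assuming `ctx` was created with enough spare memory for the work buffer:

    enum ggml_status st = ggml_graph_compute_with_ctx(ctx, cgraph, /*n_threads =*/ 4);
    if (st != GGML_STATUS_SUCCESS) {
        // compute failed or was aborted
    }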
// loops through the graph and determines:
//
// - work size needed for CPU computation
// - number of threads to start
//
GGML_API enum ggml_status ggml_graph_prepare(
struct ggml_cgraph * cgraph,
int n_threads, /* = GGML_DEFAULT_N_THREADS */
struct ggml_threadpool * threadpool /* = NULL */ );
// get the estimated work size for the graph from ggml_graph_prepare()
GGML_API size_t ggml_graph_work_size(const struct ggml_cgraph * cgraph);
// if ctx is NULL, the work buffer will be dynamically allocated. in this case, call ggml_graph_work_free() to free the buffer
// otherwise, the work buffer will be allocated in the context. no need to free it
GGML_API enum ggml_status ggml_graph_work_init(struct ggml_cgraph * cgraph, struct ggml_context * ctx);
GGML_API void ggml_graph_work_free(struct ggml_cgraph * cgraph);
// note: call ggml_graph_prepare() and ggml_graph_work_init() first
//
// sample usages:
//
// - no dynamic allocations:
//
// ... prepare ggml_context ctx ...
//
// ggml_graph_prepare (cgraph, n_threads, threadpool);
// ggml_graph_work_init(cgraph, ctx);
//
// ggml_graph_compute (cgraph); // can call many times
//
// // no need to call ggml_graph_work_free() because it is allocated in ctx
//
// - dynamic allocations:
//
// ggml_graph_prepare (cgraph, n_threads, threadpool);
// ggml_graph_work_init(cgraph, NULL); // will allocate memory
//
// ggml_graph_compute (cgraph); // can call many times
//
// ggml_graph_work_free(cgraph);
//
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph);
// end of CPU-only API
// =================================================================================================
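Putting the new CPU-only API together, a minimal end-to-end sketch (the 16 MiB context size and tensor shapes are arbitrary; error checking is mostly omitted):

    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024, // tensor data + graph metadata + work buffer
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 16);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 16);
    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);

    ggml_graph_prepare  (gf, /*n_threads =*/ 4, /*threadpool =*/ NULL);
    ggml_graph_work_init(gf, ctx); // the work buffer lives inside ctx, nothing to free explicitly

    // ... fill a->data and b->data ...

    if (ggml_graph_compute(gf) != GGML_STATUS_SUCCESS) {
        // handle the failure
    }

    ggml_free(ctx); // releases tensors, graph and work buffer in one go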
GGML_API void ggml_graph_set_abort_callback(struct ggml_cgraph * cgraph, ggml_abort_callback abort_callback, void * abort_data);
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
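The abort mechanism also moves onto the graph: a callback registered with ggml_graph_set_abort_callback() is polled during compute and stops it once it returns true. A sketch with a hypothetical deadline check (`deadline_reached` is not part of ggml):

    static bool deadline_reached(void * data) {
        const int64_t * t_end_us = (const int64_t *) data;
        return ggml_time_us() > *t_end_us; // returning true aborts the compute
    }

    // ...
    int64_t t_end_us = ggml_time_us() + 5*1000*1000; // allow 5 seconds
    ggml_graph_set_abort_callback(cgraph, deadline_reached, &t_end_us);

    if (ggml_graph_compute(cgraph) == GGML_STATUS_ABORTED) {
        // the callback fired before the graph finished
    }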
@@ -2107,6 +2138,7 @@ extern "C" {
struct ggml_cgraph * gb_tmp,
struct ggml_tensor * * checkpoints,
int n_checkpoints);
//
// optimization
//

@@ -751,8 +751,10 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_
GGML_UNUSED(backend);
}
// TODO: this struct should no longer be needed
// instead, the new ggml_graph_work_init() + ggml_graph_work_free() API should be enough to replace this
// for now, the implementation is kept as is to avoid introducing regressions
struct ggml_backend_plan_cpu {
struct ggml_cplan cplan;
struct ggml_cgraph cgraph;
};
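If the TODO above were acted on, plan creation could lean on the graph-owned work buffer instead of a hand-rolled malloc. A hypothetical sketch (not part of this commit) of what ggml_backend_cpu_graph_plan_create() might reduce to:

    static ggml_backend_graph_plan_t cpu_graph_plan_create_sketch(struct ggml_backend_cpu_context * cpu_ctx, struct ggml_cgraph * cgraph) {
        struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
        if (cpu_plan == NULL) {
            return NULL;
        }

        cpu_plan->cgraph = *cgraph; // still a shallow copy - same FIXME as below

        ggml_graph_prepare(&cpu_plan->cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);

        // let the graph own its work buffer (work_own = true)
        if (ggml_graph_work_init(&cpu_plan->cgraph, NULL) != GGML_STATUS_SUCCESS) {
            free(cpu_plan);
            return NULL;
        }

        ggml_graph_set_abort_callback(&cpu_plan->cgraph, cpu_ctx->abort_callback, cpu_ctx->abort_callback_data);

        return cpu_plan;
    }

The matching plan_free would then be ggml_graph_work_free(&cpu_plan->cgraph) followed by free(cpu_plan).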
@@ -761,19 +763,19 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
ggml_graph_prepare(&cpu_plan->cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
if (cpu_plan->cplan.work_size > 0) {
cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
if (cpu_plan->cplan.work_data == NULL) {
if (cpu_plan->cgraph.work_size > 0) {
cpu_plan->cgraph.work_data = malloc(cpu_plan->cgraph.work_size);
if (cpu_plan->cgraph.work_data == NULL) {
free(cpu_plan);
return NULL;
}
}
cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
cpu_plan->cgraph.abort_callback = cpu_ctx->abort_callback;
cpu_plan->cgraph.abort_callback_data = cpu_ctx->abort_callback_data;
return cpu_plan;
}
@@ -781,7 +783,7 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
free(cpu_plan->cplan.work_data);
free(cpu_plan->cgraph.work_data);
free(cpu_plan);
GGML_UNUSED(backend);
@@ -790,7 +792,7 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
return ggml_graph_compute(&cpu_plan->cgraph);
GGML_UNUSED(backend);
}
@@ -798,23 +800,24 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
ggml_graph_prepare(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
if (cpu_ctx->work_size < cplan.work_size) {
if (cpu_ctx->work_size < cgraph->work_size) {
free(cpu_ctx->work_data);
cpu_ctx->work_data = malloc(cplan.work_size);
cpu_ctx->work_data = malloc(cgraph->work_size);
if (cpu_ctx->work_data == NULL) {
cpu_ctx->work_size = 0;
return GGML_STATUS_ALLOC_FAILED;
}
cpu_ctx->work_size = cplan.work_size;
cpu_ctx->work_size = cgraph->work_size;
}
cplan.work_data = cpu_ctx->work_data;
cgraph->work_data = cpu_ctx->work_data;
cgraph->work_own = false; // the buffer is owned by cpu_ctx and freed in ggml_backend_cpu_free
cplan.abort_callback = cpu_ctx->abort_callback;
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
cgraph->abort_callback = cpu_ctx->abort_callback;
cgraph->abort_callback_data = cpu_ctx->abort_callback_data;
return ggml_graph_compute(cgraph, &cplan);
return ggml_graph_compute(cgraph);
}
GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {

@@ -773,6 +773,17 @@ struct ggml_cgraph {
struct ggml_hash_set visited_hash_set;
enum ggml_cgraph_eval_order order;
bool work_own;       // true if work_data was allocated by ggml_graph_work_init() and should be freed with ggml_graph_work_free()
size_t work_size;    // size of work buffer, calculated by `ggml_graph_prepare()`
uint8_t * work_data; // work buffer, allocated by `ggml_graph_work_init()` (or set directly by the caller)
int n_threads;
struct ggml_threadpool * threadpool;
// abort ggml_graph_compute when true
ggml_abort_callback abort_callback;
void * abort_callback_data;
};
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);

@@ -2001,7 +2001,6 @@ struct ggml_threadpool {
ggml_cond_t cond; // cond.var for waiting for new work
struct ggml_cgraph * cgraph;
struct ggml_cplan * cplan;
// synchronization primitives
atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
@@ -19089,14 +19088,21 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
assert(obj_size == (size_t)((char *)p - (char *)cgraph));
*cgraph = (struct ggml_cgraph) {
/*.size =*/ size,
/*.n_nodes =*/ 0,
/*.n_leafs =*/ 0,
/*.nodes =*/ nodes_ptr,
/*.grads =*/ grads_ptr,
/*.leafs =*/ leafs_ptr,
/*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr },
/*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
/*.size =*/ size,
/*.n_nodes =*/ 0,
/*.n_leafs =*/ 0,
/*.nodes =*/ nodes_ptr,
/*.grads =*/ grads_ptr,
/*.leafs =*/ leafs_ptr,
/*.visited_hash_set =*/ { hash_size, hash_used, hash_keys_ptr },
/*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
/*.work_own =*/ false,
/*.work_size =*/ 0,
/*.work_data =*/ NULL,
/*.n_threads =*/ GGML_DEFAULT_N_THREADS,
/*.threadpool =*/ NULL,
/*.abort_callback =*/ NULL,
/*.abort_callback_data =*/ NULL,
};
ggml_hash_set_reset(&cgraph->visited_hash_set);
@@ -19110,14 +19116,21 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
struct ggml_cgraph cgraph = {
/*.size =*/ 0,
/*.n_nodes =*/ i1 - i0,
/*.n_leafs =*/ 0,
/*.nodes =*/ cgraph0->nodes + i0,
/*.grads =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
/*.leafs =*/ NULL,
/*.hash_table =*/ { 0, NULL, NULL },
/*.order =*/ cgraph0->order,
/*.size =*/ 0,
/*.n_nodes =*/ i1 - i0,
/*.n_leafs =*/ 0,
/*.nodes =*/ cgraph0->nodes + i0,
/*.grads =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
/*.leafs =*/ NULL,
/*.visited_hash_set =*/ { 0, NULL, NULL },
/*.order =*/ cgraph0->order,
/*.work_own =*/ false,
/*.work_size =*/ 0,
/*.work_data =*/ NULL,
/*.n_threads =*/ GGML_DEFAULT_N_THREADS,
/*.threadpool =*/ NULL,
/*.abort_callback =*/ NULL,
/*.abort_callback_data =*/ NULL,
};
return cgraph;
@@ -19753,11 +19766,10 @@ void ggml_threadpool_resume(struct ggml_threadpool * threadpool) {
#endif
}
struct ggml_cplan ggml_graph_plan(
const struct ggml_cgraph * cgraph,
int n_threads,
struct ggml_threadpool * threadpool) {
enum ggml_status ggml_graph_prepare(
struct ggml_cgraph * cgraph,
int n_threads,
struct ggml_threadpool * threadpool) {
if (threadpool == NULL) {
GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
}
@@ -19767,9 +19779,6 @@ struct ggml_cplan ggml_graph_plan(
size_t work_size = 0;
struct ggml_cplan cplan;
memset(&cplan, 0, sizeof(struct ggml_cplan));
int max_tasks = 1;
// thread scheduling for the different operations + work buffer size estimation
@@ -19921,28 +19930,63 @@ struct ggml_cplan ggml_graph_plan(
work_size += CACHE_LINE_SIZE*(n_threads);
}
cplan.threadpool = threadpool;
cplan.n_threads = MIN(max_tasks, n_threads);
cplan.work_size = work_size;
cplan.work_data = NULL;
cgraph->threadpool = threadpool;
cgraph->n_threads = MIN(max_tasks, n_threads);
cgraph->work_size = work_size;
return cplan;
ggml_graph_work_free(cgraph);
return GGML_STATUS_SUCCESS;
}
size_t ggml_graph_work_size(const struct ggml_cgraph * cgraph) {
return cgraph->work_size;
}
enum ggml_status ggml_graph_work_init(struct ggml_cgraph * cgraph, struct ggml_context * ctx) {
GGML_ASSERT(cgraph->n_threads > 0 && "call ggml_graph_prepare first");
ggml_graph_work_free(cgraph);
if (cgraph->work_size > 0) {
if (ctx == NULL) {
cgraph->work_data = GGML_ALIGNED_MALLOC(cgraph->work_size);
if (cgraph->work_data == NULL) {
return GGML_STATUS_ALLOC_FAILED;
}
cgraph->work_own = true;
} else {
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cgraph->work_size);
cgraph->work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
cgraph->work_own = false;
}
}
return GGML_STATUS_SUCCESS;
}
void ggml_graph_work_free(struct ggml_cgraph * cgraph) {
if (cgraph->work_data && cgraph->work_own) {
GGML_ALIGNED_FREE(cgraph->work_data);
cgraph->work_data = NULL;
}
}
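Note that ggml_graph_prepare() itself calls ggml_graph_work_free() (see above), so whenever a graph is re-prepared the work buffer has to be initialized again. A sketch of the dynamic-allocation case (`new_n_threads` and `threadpool` stand in for whatever the caller uses):

    // thread count (or the graph itself) changed -> recompute the work size and re-init the buffer
    ggml_graph_prepare  (cgraph, new_n_threads, threadpool);
    ggml_graph_work_init(cgraph, NULL); // the previously owned buffer was released by prepare

    ggml_graph_compute(cgraph);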
static thread_ret_t ggml_graph_compute_thread(void * data) {
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
const struct ggml_cgraph * cgraph = state->threadpool->cgraph;
const struct ggml_cplan * cplan = state->threadpool->cplan;
set_numa_thread_affinity(state->ith);
struct ggml_compute_params params = {
/*.ith =*/ state->ith,
/*.nth =*/ state->threadpool->n_threads_cur,
/*.wsize =*/ cplan->work_size,
/*.wdata =*/ cplan->work_data,
/*.threadpool=*/ state->threadpool,
/*.ith =*/ state->ith,
/*.nth =*/ state->threadpool->n_threads_cur,
/*.wsize =*/ cgraph->work_size,
/*.wdata =*/ cgraph->work_data,
/*.threadpool =*/ state->threadpool,
};
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
@@ -19950,7 +19994,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
ggml_compute_forward(&params, node);
if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
if (state->ith == 0 && cgraph->abort_callback && cgraph->abort_callback(cgraph->abort_callback_data)) {
state->threadpool->ec = GGML_STATUS_ABORTED;
}
@@ -20104,14 +20148,12 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons
static struct ggml_threadpool * ggml_threadpool_new_impl(
struct ggml_threadpool_params * tpp,
struct ggml_cgraph * cgraph,
struct ggml_cplan * cplan) {
struct ggml_cgraph * cgraph) {
struct ggml_threadpool * threadpool =
GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool));
{
threadpool->cgraph = cgraph;
threadpool->cplan = cplan;
threadpool->n_graph = 0;
threadpool->n_barrier = 0;
threadpool->n_barrier_passed = 0;
@@ -20169,16 +20211,15 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
}
struct ggml_threadpool * ggml_threadpool_new(struct ggml_threadpool_params * tpp) {
return ggml_threadpool_new_impl(tpp, NULL, NULL);
return ggml_threadpool_new_impl(tpp, NULL);
}
enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
GGML_ASSERT(cplan);
GGML_ASSERT(cplan->n_threads > 0);
GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL);
enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph) {
GGML_ASSERT((cgraph->n_threads > 0) && "call ggml_graph_prepare first");
GGML_ASSERT((cgraph->work_size == 0 || cgraph->work_data != NULL) && "call ggml_graph_work_init first");
int n_threads = cplan->n_threads;
struct ggml_threadpool * threadpool = cplan->threadpool;
int n_threads = cgraph->n_threads;
struct ggml_threadpool * threadpool = cgraph->threadpool;
bool disposable_threadpool = false;
@@ -20187,19 +20228,18 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
disposable_threadpool = true;
struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads);
threadpool = ggml_threadpool_new_impl(&ttp, cgraph, cplan);
threadpool = ggml_threadpool_new_impl(&ttp, cgraph);
} else {
// Reset some of the parameters that need resetting
// No worker threads should be accessing the parameters below at this stage
threadpool->cgraph = cgraph;
threadpool->cplan = cplan;
threadpool->n_threads_cur = n_threads;
threadpool->current_chunk = 0;
threadpool->ec = GGML_STATUS_SUCCESS;
threadpool->cgraph = cgraph;
threadpool->n_threads_cur = n_threads;
threadpool->current_chunk = 0;
threadpool->ec = GGML_STATUS_SUCCESS;
}
if (n_threads > threadpool->n_threads_max) {
GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. Expect a bad time!\n");
GGML_PRINT("WARNING: cgraph is requesting more threads than the threadpool contains. Expect a bad time!\n");
}
#ifdef GGML_USE_OPENMP
@@ -20238,14 +20278,9 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
return ret;
}
enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL);
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
return ggml_graph_compute(cgraph, &cplan);
void ggml_graph_set_abort_callback(struct ggml_cgraph * cgraph, ggml_abort_callback abort_callback, void * abort_data) {
cgraph->abort_callback = abort_callback;
cgraph->abort_callback_data = abort_data;
}
struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
@@ -21055,9 +21090,8 @@ static enum ggml_opt_result ggml_opt_adam(
float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL);
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
ggml_graph_prepare (gb, params.n_threads, NULL);
ggml_graph_work_init(gb, ctx);
bool cancel = false;
@@ -21073,7 +21107,7 @@ static enum ggml_opt_result ggml_opt_adam(
}
// ggml_graph_reset (gf);
ggml_set_f32 (f->grad, 1.0f);
ggml_graph_compute(gb, &cplan);
ggml_graph_compute(gb);
ggml_opt_acc_grad(np, ps, g, accum_norm);
fx += ggml_get_f32_1d(f, 0);
}
@@ -21164,7 +21198,7 @@ static enum ggml_opt_result ggml_opt_adam(
}
// ggml_graph_reset (gf);
ggml_set_f32 (f->grad, 1.0f);
ggml_graph_compute(gb, &cplan);
ggml_graph_compute(gb);
ggml_opt_acc_grad(np, ps, g, accum_norm);
fx += ggml_get_f32_1d(f, 0);
}
@@ -21249,7 +21283,6 @@ static enum ggml_opt_result linesearch_backtracking(
const float * xp,
struct ggml_tensor * f,
struct ggml_cgraph * gb,
struct ggml_cplan * cplan,
const int np,
struct ggml_tensor * ps[],
bool * cancel,
@@ -21306,7 +21339,7 @@ static enum ggml_opt_result linesearch_backtracking(
}
// ggml_graph_reset (gf);
ggml_set_f32 (f->grad, 1.0f);
ggml_graph_compute(gb, cplan);
ggml_graph_compute(gb);
ggml_opt_acc_grad(np, ps, g, accum_norm);
*fx += ggml_get_f32_1d(f, 0);
}
@@ -21402,9 +21435,8 @@ static enum ggml_opt_result ggml_opt_lbfgs(
opt->iter = iter;
}
struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL);
struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
ggml_graph_prepare (gb, params.n_threads, NULL);
ggml_graph_work_init(gb, ctx);
float * x = opt->lbfgs.x->data; // current parameters
float * xp = opt->lbfgs.xp->data; // previous parameters
@@ -21449,7 +21481,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
}
// ggml_graph_reset (gf);
ggml_set_f32 (f->grad, 1.0f);
ggml_graph_compute(gb, &cplan);
ggml_graph_compute(gb);
ggml_opt_acc_grad(np, ps, g, accum_norm);
fx += ggml_get_f32_1d(f, 0);
}
@@ -21515,7 +21547,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
// to determine if the optimization should be cancelled
// this is a simple change, but not doing this atm, since I don't have a nice
// way to test and don't want to break something with so many changes lined up
ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, np, ps, &cancel, callback, callback_data);
if (cancel) {
return GGML_OPT_RESULT_CANCEL;
}