From a1e7c6922898c013ac67e0fa517531d8367da841 Mon Sep 17 00:00:00 2001 From: mqy Date: Tue, 27 Jun 2023 05:47:08 +0800 Subject: [PATCH] ggml_graph_compute: deprecate using ggml_context, try resolve issue #287 --- .../train-text-from-scratch.cpp | 2 - ggml.c | 89 +++++++++++++------ ggml.h | 26 +++++- 3 files changed, 86 insertions(+), 31 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index c50eeb343..7f7bf3b6f 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1426,11 +1426,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( gf->n_nodes = 0; gf->n_leafs = 0; - gf->work_size = 0; gf->perf_runs = 0; gf->perf_cycles = 0; gf->perf_time_us = 0; - gf->work = NULL; const auto & hparams = model->hparams; //const int n_ctx = hparams.n_ctx; diff --git a/ggml.c b/ggml.c index d257c3d65..003506600 100644 --- a/ggml.c +++ b/ggml.c @@ -15773,8 +15773,6 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { /*.n_nodes =*/ 0, /*.n_leafs =*/ 0, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, - /*.work_size =*/ 0, - /*.work =*/ NULL, /*.nodes =*/ { NULL }, /*.grads =*/ { NULL }, /*.leafs =*/ { NULL }, @@ -15946,6 +15944,7 @@ void clear_numa_thread_affinity(void) {} struct ggml_compute_state_shared { struct ggml_cgraph * cgraph; + struct ggml_cgraph_context * cgraph_ctx; int64_t perf_node_start_cycles; int64_t perf_node_start_time_us; @@ -15975,6 +15974,7 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; struct ggml_cgraph * cgraph = state->shared->cgraph; + struct ggml_cgraph_context * ctx = state->shared->cgraph_ctx; const int n_threads = state->shared->n_threads; set_numa_thread_affinity(state->ith, 
n_threads); @@ -15989,8 +15989,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.type =*/ GGML_TASK_FINALIZE, /*.ith =*/ 0, /*.nth =*/ 0, - /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, + /*.wsize =*/ ctx->work_size, + /*.wdata =*/ ctx->work_data, }; if (node_n != -1) { @@ -16057,8 +16057,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.type =*/ GGML_TASK_COMPUTE, /*.ith =*/ state->ith, /*.nth =*/ node->n_tasks, - /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, + /*.wsize =*/ ctx->work_size, + /*.wdata =*/ ctx->work_data, }; if (state->ith < node->n_tasks) { @@ -16069,23 +16069,20 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { return 0; } -void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { - const int n_threads = cgraph->n_threads; +// Prepare for graph computation. +// Sets node->n_tasks and ctx->{work_size, work_data, planned}. +void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph) { + GGML_ASSERT(ctx); + // This function is actually reentrant, but duplicate calls are unnecessary, so the asserts below reject them.
+ GGML_ASSERT(ctx->work_size == 0); + GGML_ASSERT(ctx->work_data == NULL); + GGML_ASSERT(!ctx->planned); - struct ggml_compute_state_shared state_shared = { - /*.cgraph =*/ cgraph, - /*.perf_node_start_cycles =*/ 0, - /*.perf_node_start_time_us =*/ 0, - /*.n_threads =*/ n_threads, - /*.n_active =*/ n_threads, - /*.node_n =*/ -1, - }; - struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); + int n_threads = cgraph->n_threads; + size_t work_size = 0; // initialize tasks + work buffer { - size_t work_size = 0; - // thread scheduling for the different operations for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; @@ -16399,19 +16396,53 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; } } + } - if (cgraph->work != NULL && work_size > cgraph->work_size) { - GGML_ASSERT(false); // TODO: better handling - } + if (work_size > 0) { + work_size += CACHE_LINE_SIZE*(n_threads - 1); + } - if (work_size > 0 && cgraph->work == NULL) { - cgraph->work_size = work_size + CACHE_LINE_SIZE*(n_threads - 1); + ctx->work_size = work_size; + ctx->work_data = NULL; + ctx->planned = true; +} - GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size); - cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size); +void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph) { + if (ctx == NULL) { + ctx = alloca(sizeof(struct ggml_cgraph_context)); + GGML_ASSERT(ctx); + ctx->work_size = 0; + ctx->work_data = NULL; + ctx->planned = false; + } else { + // work_size and work_data MAY still hold their default values (0 and NULL) even after the context has been planned.
+ if (ctx->work_size > 0) { + GGML_ASSERT(ctx->work_data); } } + if (!ctx->planned) { + ggml_graph_compute_plan(ctx, cgraph); + if (ctx->work_size > 0) { + ctx->work_data = malloc(ctx->work_size); // work_size is already a byte count + GGML_ASSERT(ctx->work_data); + GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, ctx->work_size); + } + } + + const int n_threads = cgraph->n_threads; + + struct ggml_compute_state_shared state_shared = { + /*.cgraph =*/ cgraph, + /*.cgraph_ctx =*/ ctx, + /*.perf_node_start_cycles =*/ 0, + /*.perf_node_start_time_us =*/ 0, + /*.n_threads =*/ n_threads, + /*.n_active =*/ n_threads, + /*.node_n =*/ -1, + }; + struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); + // create thread pool if (n_threads > 1) { for (int j = 1; j < n_threads; ++j) { @@ -16463,6 +16494,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } } +// Deprecated: kept only for backward compatibility; `ctx` is ignored. NOTE(review): with a NULL compute context the work buffer is malloc'ed on every call and no matching free() is visible in this patch -- confirm who releases it. +void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { + UNUSED(ctx); + ggml_graph_compute_v2(NULL, cgraph); +} + void ggml_graph_reset(struct ggml_cgraph * cgraph) { for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * grad = cgraph->grads[i]; diff --git a/ggml.h b/ggml.h index 24ca8ae22..f949fe35f 100644 --- a/ggml.h +++ b/ggml.h @@ -437,15 +437,23 @@ extern "C" { static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); + // graph compute context + struct ggml_cgraph_context { + // After a call to `ggml_graph_compute_plan()`, `planned` is set to true and + // `work_size` is non-zero when a work buffer is required. When a buffer + // is needed, the caller MUST allocate memory for `work_data`. + // See https://github.com/ggerganov/ggml/issues/287 + size_t work_size; + void * work_data; + bool planned; // true means ready to compute graph nodes.
+ }; + // computation graph struct ggml_cgraph { int n_nodes; int n_leafs; int n_threads; - size_t work_size; - struct ggml_tensor * work; - struct ggml_tensor * nodes[GGML_MAX_NODES]; struct ggml_tensor * grads[GGML_MAX_NODES]; struct ggml_tensor * leafs[GGML_MAX_NODES]; @@ -1297,6 +1305,18 @@ extern "C" { GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); + // Added for https://github.com/ggerganov/ggml/issues/287 + GGML_API void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph); + // Added for https://github.com/ggerganov/ggml/issues/287 + // When `ctx` is NULL, `ggml_graph_compute_v2()` calculates `work_size` and allocates memory for `work_data` itself. + // Another use case is to allocate the buffer explicitly: + // - call `ggml_graph_compute_plan()`; + // - allocate `ctx->work_size` bytes for `ctx->work_data` (when `work_size` is non-zero); + // - finally call `ggml_graph_compute_v2()`. + // NOTE: don't set `ctx->planned` manually. + GGML_API void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph); + // Deprecated: `ctx` is not required. Use `ggml_graph_compute_v2()` instead. + // See https://github.com/ggerganov/ggml/issues/287 GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);