ggml_graph_compute: deprecate using ggml_context, try resolve issue #287
This commit is contained in:
parent
31cfbb1013
commit
a1e7c69228
3 changed files with 86 additions and 31 deletions
|
@ -1426,11 +1426,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
|
|||
|
||||
gf->n_nodes = 0;
|
||||
gf->n_leafs = 0;
|
||||
gf->work_size = 0;
|
||||
gf->perf_runs = 0;
|
||||
gf->perf_cycles = 0;
|
||||
gf->perf_time_us = 0;
|
||||
gf->work = NULL;
|
||||
|
||||
const auto & hparams = model->hparams;
|
||||
//const int n_ctx = hparams.n_ctx;
|
||||
|
|
89
ggml.c
89
ggml.c
|
@ -15773,8 +15773,6 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
|
|||
/*.n_nodes =*/ 0,
|
||||
/*.n_leafs =*/ 0,
|
||||
/*.n_threads =*/ GGML_DEFAULT_N_THREADS,
|
||||
/*.work_size =*/ 0,
|
||||
/*.work =*/ NULL,
|
||||
/*.nodes =*/ { NULL },
|
||||
/*.grads =*/ { NULL },
|
||||
/*.leafs =*/ { NULL },
|
||||
|
@ -15946,6 +15944,7 @@ void clear_numa_thread_affinity(void) {}
|
|||
|
||||
struct ggml_compute_state_shared {
|
||||
struct ggml_cgraph * cgraph;
|
||||
struct ggml_cgraph_context * cgraph_ctx;
|
||||
|
||||
int64_t perf_node_start_cycles;
|
||||
int64_t perf_node_start_time_us;
|
||||
|
@ -15975,6 +15974,7 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
|
|||
static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
|
||||
struct ggml_cgraph * cgraph = state->shared->cgraph;
|
||||
struct ggml_cgraph_context * ctx = state->shared->cgraph_ctx;
|
||||
|
||||
const int n_threads = state->shared->n_threads;
|
||||
set_numa_thread_affinity(state->ith, n_threads);
|
||||
|
@ -15989,8 +15989,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||
/*.type =*/ GGML_TASK_FINALIZE,
|
||||
/*.ith =*/ 0,
|
||||
/*.nth =*/ 0,
|
||||
/*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
|
||||
/*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
|
||||
/*.wsize =*/ ctx->work_size,
|
||||
/*.wdata =*/ ctx->work_data,
|
||||
};
|
||||
|
||||
if (node_n != -1) {
|
||||
|
@ -16057,8 +16057,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||
/*.type =*/ GGML_TASK_COMPUTE,
|
||||
/*.ith =*/ state->ith,
|
||||
/*.nth =*/ node->n_tasks,
|
||||
/*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
|
||||
/*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
|
||||
/*.wsize =*/ ctx->work_size,
|
||||
/*.wdata =*/ ctx->work_data,
|
||||
};
|
||||
|
||||
if (state->ith < node->n_tasks) {
|
||||
|
@ -16069,23 +16069,20 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
|
||||
const int n_threads = cgraph->n_threads;
|
||||
// Prepare for graph computing.
|
||||
// Will set: node->n_tasks, ctx->{work_size, planned}
|
||||
void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph) {
|
||||
GGML_ASSERT(ctx);
|
||||
// This function is reentrant, but duplicate calls are unnecessary.
|
||||
GGML_ASSERT(ctx->work_size == 0);
|
||||
GGML_ASSERT(ctx->work_data == NULL);
|
||||
GGML_ASSERT(!ctx->planned);
|
||||
|
||||
struct ggml_compute_state_shared state_shared = {
|
||||
/*.cgraph =*/ cgraph,
|
||||
/*.perf_node_start_cycles =*/ 0,
|
||||
/*.perf_node_start_time_us =*/ 0,
|
||||
/*.n_threads =*/ n_threads,
|
||||
/*.n_active =*/ n_threads,
|
||||
/*.node_n =*/ -1,
|
||||
};
|
||||
struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
|
||||
int n_threads = cgraph->n_threads;
|
||||
size_t work_size = 0;
|
||||
|
||||
// initialize tasks + work buffer
|
||||
{
|
||||
size_t work_size = 0;
|
||||
|
||||
// thread scheduling for the different operations
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = cgraph->nodes[i];
|
||||
|
@ -16399,19 +16396,53 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|||
} break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (cgraph->work != NULL && work_size > cgraph->work_size) {
|
||||
GGML_ASSERT(false); // TODO: better handling
|
||||
}
|
||||
if (work_size > 0) {
|
||||
work_size += CACHE_LINE_SIZE*(n_threads - 1);
|
||||
}
|
||||
|
||||
if (work_size > 0 && cgraph->work == NULL) {
|
||||
cgraph->work_size = work_size + CACHE_LINE_SIZE*(n_threads - 1);
|
||||
ctx->work_size = work_size;
|
||||
ctx->work_data = NULL;
|
||||
ctx->planned = true;
|
||||
}
|
||||
|
||||
GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size);
|
||||
cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size);
|
||||
void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph) {
|
||||
if (ctx == NULL) {
|
||||
ctx = alloca(sizeof(struct ggml_cgraph_context));
|
||||
GGML_ASSERT(ctx);
|
||||
ctx->work_size = 0;
|
||||
ctx->work_data = NULL;
|
||||
ctx->planned = false;
|
||||
} else {
|
||||
// The work_size and work_data MAY still hold their default values even if the context has been planned.
|
||||
if (ctx->work_size > 0) {
|
||||
GGML_ASSERT(ctx->work_data);
|
||||
}
|
||||
}
|
||||
|
||||
if (!ctx->planned) {
|
||||
ggml_graph_compute_plan(ctx, cgraph);
|
||||
if (ctx->work_size > 0) {
|
||||
ctx->work_data = malloc(ctx->work_size * sizeof(GGML_TYPE_I8));
|
||||
GGML_ASSERT(ctx->work_data);
|
||||
GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, work_size);
|
||||
}
|
||||
}
|
||||
|
||||
const int n_threads = cgraph->n_threads;
|
||||
|
||||
struct ggml_compute_state_shared state_shared = {
|
||||
/*.cgraph =*/ cgraph,
|
||||
/*.cgraph_ctx =*/ ctx,
|
||||
/*.perf_node_start_cycles =*/ 0,
|
||||
/*.perf_node_start_time_us =*/ 0,
|
||||
/*.n_threads =*/ n_threads,
|
||||
/*.n_active =*/ n_threads,
|
||||
/*.node_n =*/ -1,
|
||||
};
|
||||
struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
|
||||
|
||||
// create thread pool
|
||||
if (n_threads > 1) {
|
||||
for (int j = 1; j < n_threads; ++j) {
|
||||
|
@ -16463,6 +16494,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|||
}
|
||||
}
|
||||
|
||||
// Deprecated, keep it only for backward compatibility.
|
||||
void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
|
||||
UNUSED(ctx);
|
||||
ggml_graph_compute_v2(NULL, cgraph);
|
||||
}
|
||||
|
||||
void ggml_graph_reset(struct ggml_cgraph * cgraph) {
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
struct ggml_tensor * grad = cgraph->grads[i];
|
||||
|
|
26
ggml.h
26
ggml.h
|
@ -437,15 +437,23 @@ extern "C" {
|
|||
|
||||
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
||||
|
||||
// graph compute context
//
// Filled in by `ggml_graph_compute_plan()`: after planning, `planned` is set
// to true and `work_size` holds the required scratch-buffer size (non-zero
// when a buffer is needed). When `work_size` is non-zero, the caller MUST
// allocate `work_size` bytes and point `work_data` at them before computing.
// See https://github.com/ggerganov/ggml/issues/287
struct ggml_cgraph_context {
    size_t work_size;
    void * work_data;
    bool planned; // true means ready to compute graph nodes.
};
|
||||
|
||||
// computation graph
|
||||
struct ggml_cgraph {
|
||||
int n_nodes;
|
||||
int n_leafs;
|
||||
int n_threads;
|
||||
|
||||
size_t work_size;
|
||||
struct ggml_tensor * work;
|
||||
|
||||
struct ggml_tensor * nodes[GGML_MAX_NODES];
|
||||
struct ggml_tensor * grads[GGML_MAX_NODES];
|
||||
struct ggml_tensor * leafs[GGML_MAX_NODES];
|
||||
|
@ -1297,6 +1305,18 @@ extern "C" {
|
|||
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
|
||||
|
||||
// Since https://github.com/ggerganov/ggml/issues/287
|
||||
GGML_API void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph);
|
||||
// Since https://github.com/ggerganov/ggml/issues/287
|
||||
// When `ctx` is NULL, `ggml_graph_compute_v2()` calculates work_size and allocates memory for `work_data`.
|
||||
// Another use case: allocate buffer explicitly:
|
||||
// - call `ggml_graph_compute_plan()`;
|
||||
// - allocate memory for `ctx->work_data`;
|
||||
// - finally call `ggml_graph_compute_v2()`.
|
||||
// NOTE: don't manually set `ctx->planned`.
|
||||
GGML_API void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph);
|
||||
// Deprecated, `ctx` is not required. Use `ggml_graph_compute_v2` instead.
|
||||
// See https://github.com/ggerganov/ggml/issues/287
|
||||
GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
||||
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue