rewrite: no longer consider backward compatibility; plan and make_plan
This commit is contained in:
parent a1e7c69228
commit b11ac01f6b
8 changed files with 404 additions and 165 deletions
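The pattern applied throughout the diff: instead of setting gf.n_threads and calling ggml_graph_compute(ctx, &gf), every call site now builds a plan, allocates the work buffer itself when one is needed, computes against the plan, and frees the buffer. A minimal sketch of the new calling convention in C, assembled from the call sites below (the graph gf and the thread count are placeholders):

// New: create a compute plan instead of setting gf.n_threads.
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 4);

// The caller owns the work buffer: allocate it only when the plan asks for one.
if (plan.work_size > 0) {
    plan.work_data = malloc(plan.work_size);
    GGML_ASSERT(plan.work_data);
}

// ggml_graph_compute() now takes the plan instead of a ggml_context.
ggml_graph_compute(&plan, &gf);

if (plan.work_data) {
    free(plan.work_data);
}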
|
@ -1586,7 +1586,6 @@ int main(int argc, char ** argv) {
|
|||
int n_past = 0;
|
||||
|
||||
ggml_cgraph gf = {};
|
||||
gf.n_threads = 1;
|
||||
|
||||
get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);
|
||||
|
||||
|
@ -1595,7 +1594,18 @@ int main(int argc, char ** argv) {
|
|||
struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);
|
||||
|
||||
ggml_build_forward_expand(&gf, e);
|
||||
ggml_graph_compute(ctx0, &gf);
|
||||
|
||||
{
|
||||
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = malloc(plan.work_size);
|
||||
GGML_ASSERT(plan.work_data);
|
||||
}
|
||||
ggml_graph_compute(&plan, &gf);
|
||||
if (plan.work_data) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
}
|
||||
|
||||
float error_before_opt = ggml_get_f32_1d(e, 0);
|
||||
|
||||
|
@ -1611,7 +1621,18 @@ int main(int argc, char ** argv) {
|
|||
ggml_opt(ctx0, opt_params_lbfgs, e);
|
||||
//
|
||||
ggml_build_forward_expand(&gf, e);
|
||||
ggml_graph_compute(ctx0, &gf);
|
||||
|
||||
{
|
||||
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = malloc(plan.work_size);
|
||||
GGML_ASSERT(plan.work_data);
|
||||
}
|
||||
ggml_graph_compute(&plan, &gf);
|
||||
if (plan.work_data) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
}
|
||||
|
||||
float error_after_opt = ggml_get_f32_1d(e, 0);
|
||||
|
||||
|
@ -1659,13 +1680,23 @@ int main(int argc, char ** argv) {
|
|||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
|
||||
ggml_cgraph gf = {};
|
||||
gf.n_threads = 1;
|
||||
|
||||
int n_past = 0;
|
||||
struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);
|
||||
|
||||
ggml_build_forward_expand(&gf, logits);
|
||||
ggml_graph_compute(ctx0, &gf);
|
||||
|
||||
{
|
||||
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = malloc(plan.work_size);
|
||||
GGML_ASSERT(plan.work_data);
|
||||
}
|
||||
ggml_graph_compute(&plan, &gf);
|
||||
if (plan.work_data) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
|
||||
struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
|
||||
|
|
|
@ -159,13 +159,22 @@ int main(int argc, char ** argv) {
|
|||
// printf("Creating compute graph\n");
|
||||
struct ggml_cgraph gf = ggml_build_forward(m11xm2);
|
||||
|
||||
gf.n_threads=benchmark_params.n_threads;
|
||||
printf("cgraph->n_threads=%i\n",gf.n_threads);
|
||||
printf("n_threads=%i\n", benchmark_params.n_threads);
|
||||
|
||||
TENSOR_DUMP(m11);
|
||||
TENSOR_DUMP(m2);
|
||||
|
||||
ggml_graph_compute(ctx, &gf);
|
||||
{
|
||||
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = malloc(plan.work_size);
|
||||
GGML_ASSERT(plan.work_data);
|
||||
}
|
||||
ggml_graph_compute(&plan, &gf);
|
||||
if (plan.work_data) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
}
|
||||
|
||||
TENSOR_DUMP(gf.nodes[0]);
|
||||
|
||||
|
@ -187,7 +196,6 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// printf("Creating compute graph\n");
|
||||
struct ggml_cgraph gf31 = ggml_build_forward(q31);
|
||||
gf31.n_threads=benchmark_params.n_threads;
|
||||
|
||||
// Set up a second graph computation to make sure we override the CPU cache lines
|
||||
// printf("Creating new tensor q12 & Running quantize\n");
|
||||
|
@ -199,8 +207,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
//printf("Creating compute graph\n");
|
||||
struct ggml_cgraph gf32 = ggml_build_forward(q32);
|
||||
gf32.n_threads=benchmark_params.n_threads;
|
||||
printf("cgraph->n_threads=%i\n",gf31.n_threads);
|
||||
printf("n_threads=%i\n", benchmark_params.n_threads);
|
||||
|
||||
const int dimx = sizex;
|
||||
const int dimy = sizey;
|
||||
|
@ -221,14 +228,25 @@ int main(int argc, char ** argv) {
|
|||
|
||||
long long int start = ggml_time_us();
|
||||
//printf("Running ggml_graph_compute\n");
|
||||
ggml_graph_compute(ctx, &gf31);
|
||||
{
|
||||
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = malloc(plan.work_size);
|
||||
GGML_ASSERT(plan.work_data);
|
||||
}
|
||||
ggml_graph_compute(&plan, &gf31);
|
||||
if (plan.work_data) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
}
|
||||
|
||||
long long int stop = ggml_time_us();
|
||||
long long int usec = stop-start;
|
||||
double gflops = (double)(flops_per_matrix)/usec/1000.0;
|
||||
gflops_sum += gflops;
|
||||
printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
|
||||
i,
|
||||
gf31.n_threads,
|
||||
benchmark_params.n_threads,
|
||||
sizex, sizey, sizez, flops_per_matrix,
|
||||
usec,gflops);
|
||||
|
||||
|
@ -253,7 +271,17 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
// Running a different graph computation to make sure we override the CPU cache lines
|
||||
ggml_graph_compute(ctx, &gf32);
|
||||
{
|
||||
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = malloc(plan.work_size);
|
||||
GGML_ASSERT(plan.work_data);
|
||||
}
|
||||
ggml_graph_compute(&plan, &gf32);
|
||||
if (plan.work_data) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
|
||||
|
|
|
@ -3215,9 +3215,6 @@ int main(int argc, char ** argv) {
|
|||
struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
|
||||
struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
|
||||
|
||||
// ggml_cgraph gf = {};
|
||||
gf->n_threads = params.n_threads;
|
||||
gb->n_threads = params.n_threads;
|
||||
|
||||
get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs);
|
||||
|
||||
|
@ -3246,7 +3243,17 @@ int main(int argc, char ** argv) {
|
|||
*gb = ggml_build_backward(ctx0, gf, true);
|
||||
}
|
||||
|
||||
ggml_graph_compute(ctx0, gf);
|
||||
{
|
||||
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, params.n_threads);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = malloc(plan.work_size);
|
||||
GGML_ASSERT(plan.work_data);
|
||||
}
|
||||
ggml_graph_compute(&plan, gf);
|
||||
if (plan.work_data) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
}
|
||||
|
||||
size_t used_mem_before_opt = ggml_used_mem(ctx0);
|
||||
|
||||
|
@ -3270,7 +3277,17 @@ int main(int argc, char ** argv) {
|
|||
model.train_samples += n_batch;
|
||||
model.train_tokens += n_batch * n_tokens;
|
||||
|
||||
ggml_graph_compute(ctx0, gf);
|
||||
{
|
||||
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, params.n_threads);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = malloc(plan.work_size);
|
||||
GGML_ASSERT(plan.work_data);
|
||||
}
|
||||
ggml_graph_compute(&plan, gf);
|
||||
if (plan.work_data) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
}
|
||||
|
||||
float error_after_opt = ggml_get_f32_1d(loss, 0);
|
||||
|
||||
|
@ -3352,13 +3369,23 @@ int main(int argc, char ** argv) {
|
|||
struct ggml_context * ctx0 = ggml_init(cparams);
|
||||
|
||||
ggml_cgraph gf = {};
|
||||
gf.n_threads = params.n_threads;
|
||||
|
||||
int n_past = 0;
|
||||
struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);
|
||||
|
||||
ggml_build_forward_expand(&gf, logits);
|
||||
ggml_graph_compute(ctx0, &gf);
|
||||
|
||||
{
|
||||
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, params.n_threads);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = malloc(plan.work_size);
|
||||
GGML_ASSERT(plan.work_data);
|
||||
}
|
||||
ggml_graph_compute(&plan, &gf);
|
||||
if (plan.work_data) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
}
|
||||
|
||||
//struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
|
||||
//struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
|
||||
|
|
227
ggml.c
|
@ -4583,14 +4583,13 @@ struct ggml_tensor * ggml_new_tensor_impl(
|
|||
/*.src0 =*/ NULL,
|
||||
/*.src1 =*/ NULL,
|
||||
/*.opt =*/ { NULL },
|
||||
/*.n_tasks =*/ 0,
|
||||
/*.perf_runs =*/ 0,
|
||||
/*.perf_cycles =*/ 0,
|
||||
/*.perf_time_us =*/ 0,
|
||||
/*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
|
||||
/*.name =*/ { 0 },
|
||||
/*.extra =*/ NULL,
|
||||
/*.pad =*/ { 0 },
|
||||
/*.padding =*/ { 0 },
|
||||
};
|
||||
|
||||
// TODO: this should not be needed as long as we don't rely on aligned SIMD loads
|
||||
|
@ -15772,7 +15771,6 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
|
|||
struct ggml_cgraph result = {
|
||||
/*.n_nodes =*/ 0,
|
||||
/*.n_leafs =*/ 0,
|
||||
/*.n_threads =*/ GGML_DEFAULT_N_THREADS,
|
||||
/*.nodes =*/ { NULL },
|
||||
/*.grads =*/ { NULL },
|
||||
/*.leafs =*/ { NULL },
|
||||
|
@ -15944,7 +15942,7 @@ void clear_numa_thread_affinity(void) {}
|
|||
|
||||
struct ggml_compute_state_shared {
|
||||
struct ggml_cgraph * cgraph;
|
||||
struct ggml_cgraph_context * cgraph_ctx;
|
||||
struct ggml_graph_compute_plan * cgraph_ctx;
|
||||
|
||||
int64_t perf_node_start_cycles;
|
||||
int64_t perf_node_start_time_us;
|
||||
|
@ -15974,7 +15972,9 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
|
|||
static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
|
||||
struct ggml_cgraph * cgraph = state->shared->cgraph;
|
||||
struct ggml_cgraph_context * ctx = state->shared->cgraph_ctx;
|
||||
|
||||
struct ggml_graph_compute_plan * ctx = state->shared->cgraph_ctx;
|
||||
const int *n_tasks_arr = ctx->n_tasks;
|
||||
|
||||
const int n_threads = state->shared->n_threads;
|
||||
set_numa_thread_affinity(state->ith, n_threads);
|
||||
|
@ -15997,7 +15997,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||
/* FINALIZE */
|
||||
struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
|
||||
if (GGML_OP_HAS_FINALIZE[node->op]) {
|
||||
params.nth = node->n_tasks;
|
||||
params.nth = n_tasks_arr[node_n];
|
||||
ggml_compute_forward(¶ms, node);
|
||||
ggml_graph_compute_perf_stats_node(node, state->shared);
|
||||
}
|
||||
|
@ -16008,11 +16008,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||
GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
|
||||
|
||||
struct ggml_tensor * node = cgraph->nodes[node_n];
|
||||
const int n_tasks = n_tasks_arr[node_n];
|
||||
|
||||
state->shared->perf_node_start_cycles = ggml_perf_cycles();
|
||||
state->shared->perf_node_start_time_us = ggml_perf_time_us();
|
||||
|
||||
params.nth = node->n_tasks;
|
||||
params.nth = n_tasks;
|
||||
|
||||
/* INIT */
|
||||
if (GGML_OP_HAS_INIT[node->op]) {
|
||||
|
@ -16020,7 +16021,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||
ggml_compute_forward(¶ms, node);
|
||||
}
|
||||
|
||||
if (node->n_tasks == 1) {
|
||||
if (n_tasks == 1) {
|
||||
// TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
|
||||
// they do something more efficient than spinning (?)
|
||||
params.type = GGML_TASK_COMPUTE;
|
||||
|
@ -16052,16 +16053,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||
|
||||
/* COMPUTE */
|
||||
struct ggml_tensor * node = cgraph->nodes[node_n];
|
||||
const int n_tasks = n_tasks_arr[node_n];
|
||||
|
||||
struct ggml_compute_params params = {
|
||||
/*.type =*/ GGML_TASK_COMPUTE,
|
||||
/*.ith =*/ state->ith,
|
||||
/*.nth =*/ node->n_tasks,
|
||||
/*.nth =*/ n_tasks,
|
||||
/*.wsize =*/ ctx->work_size,
|
||||
/*.wdata =*/ ctx->work_data,
|
||||
};
|
||||
|
||||
if (state->ith < node->n_tasks) {
|
||||
if (state->ith < n_tasks) {
|
||||
ggml_compute_forward(¶ms, node);
|
||||
}
|
||||
}
|
||||
|
@ -16070,15 +16072,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||
}
|
||||
|
||||
// Prepare for graph computing.
|
||||
// Will set: node->n_tasks, ctx->{work_size, planned}
|
||||
void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph) {
|
||||
GGML_ASSERT(ctx);
|
||||
// This function is actually reentrant, but duplicate calls is unnecessary.
|
||||
GGML_ASSERT(ctx->work_size == 0);
|
||||
GGML_ASSERT(ctx->work_data == NULL);
|
||||
GGML_ASSERT(!ctx->planned);
|
||||
struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
||||
if (n_threads <= 0) {
|
||||
n_threads = GGML_DEFAULT_N_THREADS;
|
||||
}
|
||||
|
||||
int n_threads = cgraph->n_threads;
|
||||
struct ggml_graph_compute_plan ctx;
|
||||
memset(&ctx, 0, sizeof(struct ggml_graph_compute_plan));
|
||||
int * n_tasks = ctx.n_tasks;
|
||||
size_t work_size = 0;
|
||||
|
||||
// initialize tasks + work buffer
|
||||
|
@ -16091,11 +16092,11 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
|
|||
case GGML_OP_CPY:
|
||||
case GGML_OP_DUP:
|
||||
{
|
||||
node->n_tasks = n_threads;
|
||||
n_tasks[i] = n_threads;
|
||||
|
||||
size_t cur = 0;
|
||||
if (ggml_is_quantized(node->type)) {
|
||||
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads;
|
||||
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks[i];
|
||||
}
|
||||
|
||||
work_size = MAX(work_size, cur);
|
||||
|
@ -16103,24 +16104,24 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
|
|||
case GGML_OP_ADD:
|
||||
case GGML_OP_ADD1:
|
||||
{
|
||||
node->n_tasks = n_threads;
|
||||
n_tasks[i] = n_threads;
|
||||
|
||||
size_t cur = 0;
|
||||
|
||||
if (ggml_is_quantized(node->src0->type)) {
|
||||
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads;
|
||||
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_tasks[i];
|
||||
}
|
||||
|
||||
work_size = MAX(work_size, cur);
|
||||
} break;
|
||||
case GGML_OP_ACC:
|
||||
{
|
||||
node->n_tasks = n_threads;
|
||||
n_tasks[i] = n_threads;
|
||||
|
||||
size_t cur = 0;
|
||||
|
||||
if (ggml_is_quantized(node->src0->type)) {
|
||||
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_threads;
|
||||
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_tasks[i];
|
||||
}
|
||||
|
||||
work_size = MAX(work_size, cur);
|
||||
|
@ -16144,7 +16145,7 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
|
|||
case GGML_OP_ELU:
|
||||
case GGML_OP_RELU:
|
||||
{
|
||||
node->n_tasks = 1;
|
||||
n_tasks[i] = 1;
|
||||
} break;
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_GELU:
|
||||
|
@ -16155,32 +16156,32 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
|
|||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_RMS_NORM_BACK:
|
||||
{
|
||||
node->n_tasks = n_threads;
|
||||
n_tasks[i] = n_threads;
|
||||
} break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
case GGML_OP_OUT_PROD:
|
||||
{
|
||||
node->n_tasks = n_threads;
|
||||
n_tasks[i] = n_threads;
|
||||
|
||||
// TODO: use different scheduling for different matrix sizes
|
||||
//const int nr0 = ggml_nrows(node->src0);
|
||||
//const int nr1 = ggml_nrows(node->src1);
|
||||
|
||||
//node->n_tasks = MIN(n_threads, MAX(1, nr0/128));
|
||||
//printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, node->n_tasks);
|
||||
//n_tasks[i] = MIN(n_threads, MAX(1, nr0/128));
|
||||
//printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, n_tasks[i]);
|
||||
|
||||
size_t cur = 0;
|
||||
const enum ggml_type vec_dot_type = type_traits[node->src0->type].vec_dot_type;
|
||||
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
|
||||
node->n_tasks = 1; // TODO: this actually is doing nothing
|
||||
n_tasks[i] = 1; // TODO: this actually is doing nothing
|
||||
// the threads are still spinning
|
||||
}
|
||||
else
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) {
|
||||
node->n_tasks = 1; // TODO: this actually is doing nothing
|
||||
n_tasks[i] = 1; // TODO: this actually is doing nothing
|
||||
// the threads are still spinning
|
||||
cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
|
||||
}
|
||||
|
@ -16188,7 +16189,7 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
|
|||
#endif
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
||||
node->n_tasks = 1; // TODO: this actually is doing nothing
|
||||
n_tasks[i] = 1; // TODO: this actually is doing nothing
|
||||
// the threads are still spinning
|
||||
if (node->src0->type != GGML_TYPE_F32) {
|
||||
// here we need memory just for single 2D matrix from src0
|
||||
|
@ -16206,7 +16207,7 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
|
|||
} break;
|
||||
case GGML_OP_SCALE:
|
||||
{
|
||||
node->n_tasks = 1;
|
||||
n_tasks[i] = 1;
|
||||
} break;
|
||||
case GGML_OP_SET:
|
||||
case GGML_OP_CONT:
|
||||
|
@ -16219,7 +16220,7 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
|
|||
case GGML_OP_DIAG:
|
||||
case GGML_OP_DIAG_MASK_ZERO:
|
||||
{
|
||||
node->n_tasks = 1;
|
||||
n_tasks[i] = 1;
|
||||
} break;
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
|
@ -16227,19 +16228,19 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
|
|||
case GGML_OP_ROPE:
|
||||
case GGML_OP_ROPE_BACK:
|
||||
{
|
||||
node->n_tasks = n_threads;
|
||||
n_tasks[i] = n_threads;
|
||||
} break;
|
||||
case GGML_OP_ALIBI:
|
||||
{
|
||||
node->n_tasks = 1; //TODO
|
||||
n_tasks[i] = 1; //TODO
|
||||
} break;
|
||||
case GGML_OP_CLAMP:
|
||||
{
|
||||
node->n_tasks = 1; //TODO
|
||||
n_tasks[i] = 1; //TODO
|
||||
} break;
|
||||
case GGML_OP_CONV_1D:
|
||||
{
|
||||
node->n_tasks = n_threads;
|
||||
n_tasks[i] = n_threads;
|
||||
|
||||
GGML_ASSERT(node->src0->ne[3] == 1);
|
||||
GGML_ASSERT(node->src1->ne[2] == 1);
|
||||
|
@ -16268,7 +16269,7 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
|
|||
} break;
|
||||
case GGML_OP_CONV_2D:
|
||||
{
|
||||
node->n_tasks = n_threads;
|
||||
n_tasks[i] = n_threads;
|
||||
|
||||
GGML_ASSERT(node->src1->ne[3] == 1);
|
||||
|
||||
|
@ -16303,45 +16304,45 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
|
|||
} break;
|
||||
case GGML_OP_FLASH_ATTN:
|
||||
{
|
||||
node->n_tasks = n_threads;
|
||||
n_tasks[i] = n_threads;
|
||||
|
||||
size_t cur = 0;
|
||||
|
||||
const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
|
||||
|
||||
if (node->src1->type == GGML_TYPE_F32) {
|
||||
cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2
|
||||
cur = sizeof(float)*ne11*n_tasks[i]; // TODO: this can become (n_tasks[i]-1)
|
||||
cur += sizeof(float)*ne11*n_tasks[i]; // this is overestimated by x2
|
||||
}
|
||||
|
||||
if (node->src1->type == GGML_TYPE_F16) {
|
||||
cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2
|
||||
cur = sizeof(float)*ne11*n_tasks[i]; // TODO: this can become (n_tasks[i]-1)
|
||||
cur += sizeof(float)*ne11*n_tasks[i]; // this is overestimated by x2
|
||||
}
|
||||
|
||||
work_size = MAX(work_size, cur);
|
||||
} break;
|
||||
case GGML_OP_FLASH_FF:
|
||||
{
|
||||
node->n_tasks = n_threads;
|
||||
n_tasks[i] = n_threads;
|
||||
|
||||
size_t cur = 0;
|
||||
|
||||
if (node->src1->type == GGML_TYPE_F32) {
|
||||
cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2
|
||||
cur = sizeof(float)*node->src1->ne[1]*n_tasks[i]; // TODO: this can become (n_tasks[i]-1)
|
||||
cur += sizeof(float)*node->src1->ne[1]*n_tasks[i]; // this is overestimated by x2
|
||||
}
|
||||
|
||||
if (node->src1->type == GGML_TYPE_F16) {
|
||||
cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2
|
||||
cur = sizeof(float)*node->src1->ne[1]*n_tasks[i]; // TODO: this can become (n_tasks[i]-1)
|
||||
cur += sizeof(float)*node->src1->ne[1]*n_tasks[i]; // this is overestimated by x2
|
||||
}
|
||||
|
||||
work_size = MAX(work_size, cur);
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN_BACK:
|
||||
{
|
||||
node->n_tasks = n_threads;
|
||||
n_tasks[i] = n_threads;
|
||||
|
||||
size_t cur = 0;
|
||||
|
||||
|
@ -16349,13 +16350,13 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
|
|||
const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
|
||||
const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back
|
||||
if (node->src1->type == GGML_TYPE_F32) {
|
||||
cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2
|
||||
cur = sizeof(float)*mxDn*n_tasks[i]; // TODO: this can become (n_tasks[i]-1)
|
||||
cur += sizeof(float)*mxDn*n_tasks[i]; // this is overestimated by x2
|
||||
}
|
||||
|
||||
if (node->src1->type == GGML_TYPE_F16) {
|
||||
cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2
|
||||
cur = sizeof(float)*mxDn*n_tasks[i]; // TODO: this can become (n_tasks[i]-1)
|
||||
cur += sizeof(float)*mxDn*n_tasks[i]; // this is overestimated by x2
|
||||
}
|
||||
|
||||
work_size = MAX(work_size, cur);
|
||||
|
@ -16368,27 +16369,27 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
|
|||
case GGML_OP_MAP_CUSTOM2:
|
||||
case GGML_OP_MAP_CUSTOM3:
|
||||
{
|
||||
node->n_tasks = 1;
|
||||
n_tasks[i] = 1;
|
||||
} break;
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS:
|
||||
{
|
||||
node->n_tasks = n_threads;
|
||||
n_tasks[i] = n_threads;
|
||||
|
||||
size_t cur = ggml_type_size(node->type)*(node->n_tasks + node->src0->ne[0]*node->n_tasks);
|
||||
size_t cur = ggml_type_size(node->type)*(n_tasks[i] + node->src0->ne[0]*n_tasks[i]);
|
||||
|
||||
work_size = MAX(work_size, cur);
|
||||
} break;
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
|
||||
{
|
||||
node->n_tasks = n_threads;
|
||||
n_tasks[i] = n_threads;
|
||||
|
||||
size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*node->n_tasks;
|
||||
size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_tasks[i];
|
||||
|
||||
work_size = MAX(work_size, cur);
|
||||
} break;
|
||||
case GGML_OP_NONE:
|
||||
{
|
||||
node->n_tasks = 1;
|
||||
n_tasks[i] = 1;
|
||||
} break;
|
||||
case GGML_OP_COUNT:
|
||||
{
|
||||
|
@ -16402,35 +16403,31 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap
|
|||
work_size += CACHE_LINE_SIZE*(n_threads - 1);
|
||||
}
|
||||
|
||||
ctx->work_size = work_size;
|
||||
ctx->work_data = NULL;
|
||||
ctx->planned = true;
|
||||
ctx.n_threads = n_threads;
|
||||
ctx.work_size = work_size;
|
||||
ctx.work_data = NULL;
|
||||
|
||||
return ctx;
|
||||
}
|
||||
|
||||
void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph) {
|
||||
if (ctx == NULL) {
|
||||
ctx = alloca(sizeof(struct ggml_cgraph_context));
|
||||
void ggml_graph_compute(struct ggml_graph_compute_plan * ctx, struct ggml_cgraph * cgraph) {
|
||||
{
|
||||
GGML_ASSERT(ctx);
|
||||
ctx->work_size = 0;
|
||||
ctx->work_data = NULL;
|
||||
ctx->planned = false;
|
||||
} else {
|
||||
// The work_size and work_data MAY have default values even if has been planned.
|
||||
GGML_ASSERT(ctx->n_threads > 0);
|
||||
|
||||
if (ctx->work_size > 0) {
|
||||
GGML_ASSERT(ctx->work_data);
|
||||
}
|
||||
}
|
||||
|
||||
if (!ctx->planned) {
|
||||
ggml_graph_compute_plan(ctx, cgraph);
|
||||
if (ctx->work_size > 0) {
|
||||
ctx->work_data = malloc(ctx->work_size * sizeof(GGML_TYPE_I8));
|
||||
GGML_ASSERT(ctx->work_data);
|
||||
GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, work_size);
|
||||
for (int i = 0; i < cgraph->n_nodes; ++i) {
|
||||
if (cgraph->nodes[i]->op != GGML_OP_NONE) {
|
||||
GGML_ASSERT(ctx->n_tasks[i] > 0);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
const int n_threads = cgraph->n_threads;
|
||||
const int n_threads = ctx->n_threads;
|
||||
|
||||
struct ggml_compute_state_shared state_shared = {
|
||||
/*.cgraph =*/ cgraph,
|
||||
|
@ -16494,12 +16491,6 @@ void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph
|
|||
}
|
||||
}
|
||||
|
||||
// Deprecated, keep it only for backward compatibility.
|
||||
void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
|
||||
UNUSED(ctx);
|
||||
ggml_graph_compute_v2(NULL, cgraph);
|
||||
}
|
||||
|
||||
void ggml_graph_reset(struct ggml_cgraph * cgraph) {
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
struct ggml_tensor * grad = cgraph->grads[i];
|
||||
|
@ -16548,14 +16539,13 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
|
|||
const int64_t * ne = tensor->ne;
|
||||
const size_t * nb = tensor->nb;
|
||||
|
||||
fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %8d %16p %32s\n",
|
||||
fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
|
||||
arg,
|
||||
ggml_type_name(tensor->type),
|
||||
ggml_op_name (tensor->op),
|
||||
tensor->n_dims,
|
||||
ne[0], ne[1], ne[2], ne[3],
|
||||
nb[0], nb[1], nb[2], nb[3],
|
||||
tensor->n_tasks,
|
||||
tensor->data,
|
||||
tensor->name);
|
||||
}
|
||||
|
@ -17283,7 +17273,6 @@ static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g
|
|||
//
|
||||
|
||||
static enum ggml_opt_result ggml_opt_adam(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_opt_context * opt,
|
||||
struct ggml_opt_params params,
|
||||
struct ggml_tensor * f,
|
||||
|
@ -17291,9 +17280,6 @@ static enum ggml_opt_result ggml_opt_adam(
|
|||
struct ggml_cgraph * gb) {
|
||||
GGML_ASSERT(ggml_is_scalar(f));
|
||||
|
||||
gf->n_threads = params.n_threads;
|
||||
gb->n_threads = params.n_threads;
|
||||
|
||||
// these will store the parameters we want to optimize
|
||||
struct ggml_tensor * ps[GGML_MAX_PARAMS];
|
||||
|
||||
|
@ -17340,7 +17326,18 @@ static enum ggml_opt_result ggml_opt_adam(
|
|||
// compute the function value
|
||||
ggml_graph_reset (gf);
|
||||
ggml_set_f32 (f->grad, 1.0f);
|
||||
ggml_graph_compute(ctx, gb);
|
||||
|
||||
{
|
||||
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gb, params.n_threads);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = malloc(plan.work_size);
|
||||
GGML_ASSERT(plan.work_data);
|
||||
}
|
||||
ggml_graph_compute(&plan, gb);
|
||||
if (plan.work_data) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
}
|
||||
|
||||
opt->adam.fx_prev = ggml_get_f32_1d(f, 0);
|
||||
opt->adam.fx_best = opt->adam.fx_prev;
|
||||
|
@ -17420,7 +17417,18 @@ static enum ggml_opt_result ggml_opt_adam(
|
|||
|
||||
ggml_graph_reset (gf);
|
||||
ggml_set_f32 (f->grad, 1.0f);
|
||||
ggml_graph_compute(ctx, gb);
|
||||
|
||||
{
|
||||
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gb, params.n_threads);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = malloc(plan.work_size);
|
||||
GGML_ASSERT(plan.work_data);
|
||||
}
|
||||
ggml_graph_compute(&plan, gb);
|
||||
if (plan.work_data) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
}
|
||||
|
||||
const float fx = ggml_get_f32_1d(f, 0);
|
||||
|
||||
|
@ -17491,7 +17499,6 @@ struct ggml_lbfgs_iteration_data {
|
|||
};
|
||||
|
||||
static enum ggml_opt_result linesearch_backtracking(
|
||||
struct ggml_context * ctx,
|
||||
const struct ggml_opt_params * params,
|
||||
int nx,
|
||||
float * x,
|
||||
|
@ -17542,7 +17549,18 @@ static enum ggml_opt_result linesearch_backtracking(
|
|||
|
||||
ggml_graph_reset (gf);
|
||||
ggml_set_f32 (f->grad, 1.0f);
|
||||
ggml_graph_compute(ctx, gb);
|
||||
|
||||
{
|
||||
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gb, params->n_threads);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = malloc(plan.work_size);
|
||||
GGML_ASSERT(plan.work_data);
|
||||
}
|
||||
ggml_graph_compute(&plan, gb);
|
||||
if (plan.work_data) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
}
|
||||
|
||||
ggml_opt_get_grad(np, ps, g);
|
||||
|
||||
|
@ -17610,9 +17628,6 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|||
}
|
||||
}
|
||||
|
||||
gf->n_threads = params.n_threads;
|
||||
gb->n_threads = params.n_threads;
|
||||
|
||||
const int m = params.lbfgs.m;
|
||||
|
||||
// these will store the parameters we want to optimize
|
||||
|
@ -17664,7 +17679,17 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|||
|
||||
ggml_graph_reset (gf);
|
||||
ggml_set_f32 (f->grad, 1.0f);
|
||||
ggml_graph_compute(ctx, gb);
|
||||
{
|
||||
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gb, params.n_threads);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = malloc(plan.work_size);
|
||||
GGML_ASSERT(plan.work_data);
|
||||
}
|
||||
ggml_graph_compute(&plan, gb);
|
||||
if (plan.work_data) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
}
|
||||
|
||||
ggml_opt_get_grad(np, ps, g);
|
||||
|
||||
|
@ -17723,7 +17748,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
|
|||
ggml_vec_cpy_f32(nx, xp, x);
|
||||
ggml_vec_cpy_f32(nx, gp, g);
|
||||
|
||||
ls = linesearch_backtracking(ctx, ¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps);
|
||||
ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps);
|
||||
|
||||
if (ls < 0) {
|
||||
// linesearch failed - go back to the previous point and return
|
||||
|
@ -18025,7 +18050,7 @@ enum ggml_opt_result ggml_opt_resume_g(
|
|||
switch (opt->params.type) {
|
||||
case GGML_OPT_ADAM:
|
||||
{
|
||||
result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb);
|
||||
result = ggml_opt_adam(opt, opt->params, f, gf, gb);
|
||||
} break;
|
||||
case GGML_OPT_LBFGS:
|
||||
{
|
||||
|
|
52
ggml.h
|
@ -65,7 +65,16 @@
|
|||
// ggml_set_f32(a, 3.0f);
|
||||
// ggml_set_f32(b, 4.0f);
|
||||
//
|
||||
// ggml_graph_compute(ctx0, &gf);
|
||||
// const int n_threads = 1;
|
||||
// struct ggml_graph_compute_plan ctx = ggml_graph_compute_make_plan(&gf, n_threads);
|
||||
// if (ctx.work_size > 0) {
|
||||
// ctx.work_data = malloc(ctx.work_size);
|
||||
// GGML_ASSERT(ctx.work_data);
|
||||
// }
|
||||
// ggml_graph_compute(&ctx, &gf);
|
||||
// if (ctx.work_data) {
|
||||
// free(ctx.work_data);
|
||||
// }
|
||||
//
|
||||
// printf("f = %f\n", ggml_get_f32_1d(f, 0));
|
||||
//
|
||||
|
@ -418,9 +427,6 @@ extern "C" {
|
|||
struct ggml_tensor * src1;
|
||||
struct ggml_tensor * opt[GGML_MAX_OPT];
|
||||
|
||||
// thread scheduling
|
||||
int n_tasks;
|
||||
|
||||
// performance
|
||||
int perf_runs;
|
||||
int64_t perf_cycles;
|
||||
|
@ -432,27 +438,30 @@ extern "C" {
|
|||
|
||||
void * extra; // extra things e.g. for ggml-cuda.cu
|
||||
|
||||
char padding[4];
|
||||
char padding[8];
|
||||
};
|
||||
|
||||
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
||||
|
||||
// graph compute context
|
||||
struct ggml_cgraph_context {
|
||||
// After call to `ggml_graph_compute_plan()`, `planned` is set as true,
|
||||
// `work_size` will be updated as non-zero when buffer is required. When
|
||||
// need buffer, caller MUST allocate memory for `work_data`.
|
||||
// See https://github.com/ggerganov/ggml/issues/287
|
||||
// The default graph compute plan that needs to be prepared for ggml_graph_compute().
|
||||
// Since https://github.com/ggerganov/ggml/issues/287
|
||||
struct ggml_graph_compute_plan {
|
||||
// Size of work buffer, calculated by `ggml_graph_compute_make_plan()`.
|
||||
size_t work_size;
|
||||
// Worker buffer.
|
||||
// Expect allocate/free by caller before/after calling to `ggml_graph_compute()`.
|
||||
void * work_data;
|
||||
bool planned; // true means ready to compute graph nodes.
|
||||
|
||||
int n_threads;
|
||||
|
||||
// The `n_tasks` of nodes, 1:1 mapping to cgraph nodes.
|
||||
int n_tasks[GGML_MAX_NODES];
|
||||
};
|
||||
|
||||
// computation graph
|
||||
struct ggml_cgraph {
|
||||
int n_nodes;
|
||||
int n_leafs;
|
||||
int n_threads;
|
||||
|
||||
struct ggml_tensor * nodes[GGML_MAX_NODES];
|
||||
struct ggml_tensor * grads[GGML_MAX_NODES];
|
||||
|
@ -1305,19 +1314,10 @@ extern "C" {
|
|||
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
|
||||
|
||||
// Since https://github.com/ggerganov/ggml/issues/287
|
||||
GGML_API void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph);
|
||||
// Since https://github.com/ggerganov/ggml/issues/287
|
||||
// When `ctx` is NULL, `ggml_graph_compute_v2()` calculates work_size and allocates memory for `work_data`.
|
||||
// Another use case: allocate buffer explicitly:
|
||||
// - call `ggml_graph_compute_plan()`;
|
||||
// - allocate memory for `ctx->work_data`;
|
||||
// - finally call `ggml_graph_compute_v2()`.
|
||||
// NOTE: don't manually set `ctx->planned`.
|
||||
GGML_API void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph);
|
||||
// Deprecated, `ctx` is not required. Use `ggml_graph_compute_v2` instead.
|
||||
// See https://github.com/ggerganov/ggml/issues/287
|
||||
GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
||||
// ggml_graph_compute_make_plan() needs to be called before ggml_graph_compute().
|
||||
// Returns a plan object. When plan.work_size > 0, caller must allocate memory for plan.work_data.
|
||||
GGML_API struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * cgraph, const int n_threads/*=GGML_DEFAULT_N_THREADS*/);
|
||||
GGML_API void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgraph * cgraph);
|
||||
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
|
||||
|
|
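Since the one-call ggml_graph_compute(ctx, &gf) form is removed, each call site in this commit repeats the same plan/allocate/compute/free block. A small wrapper like the one below would capture that pattern in one place; this is a sketch only, the helper is not part of this commit and its name is made up:

// Hypothetical helper, not part of this commit: plan the graph, allocate the
// work buffer if required, run the computation, and release the buffer.
static void graph_compute_helper(struct ggml_cgraph * graph, int n_threads) {
    struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(graph, n_threads);
    if (plan.work_size > 0) {
        plan.work_data = malloc(plan.work_size);
        GGML_ASSERT(plan.work_data);
    }
    ggml_graph_compute(&plan, graph);
    if (plan.work_data) {
        free(plan.work_data);
    }
}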
68
llama.cpp
|
@ -1309,7 +1309,7 @@ static bool llama_eval_internal(
|
|||
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
||||
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
||||
ggml_cgraph gf = {};
|
||||
gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
|
||||
const int actual_n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
@ -1612,10 +1612,30 @@ static bool llama_eval_internal(
|
|||
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
|
||||
}
|
||||
|
||||
ggml_graph_compute(ctx0, &gf);
|
||||
{
|
||||
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = malloc(plan.work_size);
|
||||
GGML_ASSERT(plan.work_data);
|
||||
}
|
||||
ggml_graph_compute(&plan, &gf);
|
||||
if (plan.work_data) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
ggml_graph_compute(ctx0, &gf);
|
||||
{
|
||||
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = malloc(plan.work_size);
|
||||
GGML_ASSERT(plan.work_data);
|
||||
}
|
||||
ggml_graph_compute(&plan, &gf);
|
||||
if (plan.work_data) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (cgraph_fname) {
|
||||
|
@ -2966,8 +2986,18 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
|
|||
}
|
||||
|
||||
struct ggml_cgraph gf = ggml_build_forward(r);
|
||||
gf.n_threads = n_threads;
|
||||
ggml_graph_compute(lora_ctx, &gf);
|
||||
|
||||
{
|
||||
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = malloc(plan.work_size);
|
||||
GGML_ASSERT(plan.work_data);
|
||||
}
|
||||
ggml_graph_compute(&plan, &gf);
|
||||
if (plan.work_data) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
}
|
||||
|
||||
// we won't need these tensors again, reset the context to save memory
|
||||
ggml_free(lora_ctx);
|
||||
|
@ -3120,7 +3150,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
|||
|
||||
ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
|
||||
ggml_cgraph gf{};
|
||||
gf.n_threads = 1;
|
||||
|
||||
ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
|
||||
kout3d->data = out;
|
||||
|
@ -3140,7 +3169,18 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
|||
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
|
||||
ggml_graph_compute(cpy_ctx, &gf);
|
||||
|
||||
{
|
||||
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = malloc(plan.work_size);
|
||||
GGML_ASSERT(plan.work_data);
|
||||
}
|
||||
ggml_graph_compute(&plan, &gf);
|
||||
if (plan.work_data) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
}
|
||||
|
||||
ggml_free(cpy_ctx);
|
||||
}
|
||||
|
@ -3226,7 +3266,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|||
|
||||
ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
|
||||
ggml_cgraph gf{};
|
||||
gf.n_threads = 1;
|
||||
|
||||
ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
|
||||
kin3d->data = (void *) inp;
|
||||
|
@ -3246,7 +3285,18 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|||
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
|
||||
ggml_graph_compute(cpy_ctx, &gf);
|
||||
|
||||
{
|
||||
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = malloc(plan.work_size);
|
||||
GGML_ASSERT(plan.work_data);
|
||||
}
|
||||
ggml_graph_compute(&plan, &gf);
|
||||
if (plan.work_data) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
}
|
||||
|
||||
ggml_free(cpy_ctx);
|
||||
}
|
||||
|
|
|
@ -215,15 +215,36 @@ bool check_gradient(
|
|||
}
|
||||
|
||||
struct ggml_cgraph gf = ggml_build_forward (f);
|
||||
gf.n_threads = n_threads;
|
||||
|
||||
struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
|
||||
gb.n_threads = n_threads;
|
||||
|
||||
ggml_graph_compute(ctx0, &gf);
|
||||
{
|
||||
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = malloc(plan.work_size);
|
||||
GGML_ASSERT(plan.work_data);
|
||||
}
|
||||
ggml_graph_compute(&plan, &gf);
|
||||
if (plan.work_data) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
}
|
||||
|
||||
ggml_graph_reset (&gf);
|
||||
ggml_set_f32 (f->grad, 1.0f);
|
||||
ggml_graph_compute(ctx0, &gb);
|
||||
|
||||
{
|
||||
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = malloc(plan.work_size);
|
||||
GGML_ASSERT(plan.work_data);
|
||||
}
|
||||
ggml_graph_compute(&plan, &gb);
|
||||
if (plan.work_data) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot");
|
||||
// ggml_graph_dump_dot(&gb, &gf, "test-grad0-backward.dot");
|
||||
|
@ -236,12 +257,34 @@ bool check_gradient(
|
|||
const float xm = x0 - eps;
|
||||
const float xp = x0 + eps;
|
||||
set_element(x[i], k, xp);
|
||||
ggml_graph_compute(ctx0, &gf);
|
||||
|
||||
{
|
||||
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = malloc(plan.work_size);
|
||||
GGML_ASSERT(plan.work_data);
|
||||
}
|
||||
ggml_graph_compute(&plan, &gf);
|
||||
if (plan.work_data) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
}
|
||||
|
||||
const float f0 = ggml_get_f32_1d(f, 0);
|
||||
|
||||
set_element(x[i], k, xm);
|
||||
ggml_graph_compute(ctx0, &gf);
|
||||
|
||||
{
|
||||
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = malloc(plan.work_size);
|
||||
GGML_ASSERT(plan.work_data);
|
||||
}
|
||||
ggml_graph_compute(&plan, &gf);
|
||||
if (plan.work_data) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
}
|
||||
|
||||
const float f1 = ggml_get_f32_1d(f, 0);
|
||||
|
||||
|
@ -252,7 +295,18 @@ bool check_gradient(
|
|||
// compute gradient using backward graph
|
||||
ggml_graph_reset (&gf);
|
||||
ggml_set_f32 (f->grad, 1.0f);
|
||||
ggml_graph_compute(ctx0, &gb);
|
||||
|
||||
{
|
||||
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = malloc(plan.work_size);
|
||||
GGML_ASSERT(plan.work_data);
|
||||
}
|
||||
ggml_graph_compute(&plan, &gb);
|
||||
if (plan.work_data) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
}
|
||||
|
||||
const float g1 = get_element(x[i]->grad, k);
|
||||
|
||||
|
|
|
@ -140,7 +140,19 @@ int main(int argc, const char ** argv) {
|
|||
|
||||
struct ggml_cgraph ge = ggml_build_forward(e);
|
||||
ggml_graph_reset (&ge);
|
||||
ggml_graph_compute(ctx, &ge);
|
||||
|
||||
{
|
||||
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = malloc(plan.work_size);
|
||||
GGML_ASSERT(plan.work_data);
|
||||
}
|
||||
ggml_graph_compute(&plan, &ge);
|
||||
if (plan.work_data) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
}
|
||||
|
||||
const float fe = ggml_get_f32_1d(e, 0);
|
||||
printf("%s: e = %.4f\n", __func__, fe);
|
||||
|
||||
|
@ -149,7 +161,19 @@ int main(int argc, const char ** argv) {
|
|||
ggml_opt(ctx, opt_params, e);
|
||||
|
||||
ggml_graph_reset (&ge);
|
||||
ggml_graph_compute(ctx, &ge);
|
||||
|
||||
{
|
||||
struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = malloc(plan.work_size);
|
||||
GGML_ASSERT(plan.work_data);
|
||||
}
|
||||
ggml_graph_compute(&plan, &ge);
|
||||
if (plan.work_data) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
}
|
||||
|
||||
const float fe_opt = ggml_get_f32_1d(e, 0);
|
||||
printf("%s: original e = %.4f\n", __func__, fe);
|
||||
printf("%s: optimized e = %.4f\n", __func__, fe_opt);