ggml : change ggml_graph_compute() API to not require context (#1999)
* ggml_graph_compute: deprecate using ggml_context, try to resolve issue #287
* rewrite: no longer consider backward compatibility; plan and make_plan
* minor: rename ctx as plan; const
* remove ggml_graph_compute from tests/test-grad0.c, but the current change breaks the backward pass
* add static ggml_graph_compute_sugar()
* minor: update comments
* reusable buffers
* ggml : more consistent naming + metal fixes
* ggml : fix docs
* tests : disable grad / opt + minor naming changes
* ggml : add ggml_graph_compute_with_ctx()
  - backwards-compatible API
  - deduplicates a lot of copy-paste
* ci : enable test-grad0
* examples : factor out plan allocation into a helper function
* llama : factor out plan stuff into a helper function
* ci : fix env
* llama : fix duplicate symbols + refactor example benchmark
* ggml : remove obsolete assert + refactor n_tasks section
* ggml : fix indentation in switch
* llama : avoid unnecessary bool
* ggml : remove comments from source file and match order in header

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
parent 7242140283
commit 1d656d6360

13 changed files with 571 additions and 449 deletions
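All of the example changes below follow the same pattern: the work buffer no longer lives inside ggml_cgraph, so each call site first builds a ggml_cplan and then hands it to ggml_graph_compute(). A minimal sketch of that flow, using only the calls visible in the hunks on this page (graph construction and error handling elided):

    // Minimal sketch of the plan-based compute flow introduced by this commit.
    // Mirrors the ggml_graph_compute_helper() added to the examples below.
    #include <vector>
    #include "ggml.h"

    static void compute_graph(struct ggml_cgraph * graph, int n_threads) {
        // ask ggml how much scratch memory this graph needs for n_threads
        struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);

        // the caller now owns the work buffer (it used to be a ggml_cgraph field)
        std::vector<uint8_t> work;
        if (plan.work_size > 0) {
            work.resize(plan.work_size);
            plan.work_data = work.data();
        }

        ggml_graph_compute(graph, &plan);
    }

Callers that still keep a ggml_context around can use the backwards-compatible ggml_graph_compute_with_ctx() mentioned in the commit message (see the sketch at the end of this page).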
@@ -31,6 +31,17 @@ float frand_normal(struct random_normal_distribution * rnd) {
     return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
 }
 
+void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
 struct ggml_tensor * randomize_tensor(
         struct ggml_tensor * tensor,
         int ndims,
@@ -1569,6 +1580,8 @@ int main(int argc, char ** argv) {
     int n_tokens = model.hparams.n_ctx;
     int n_vocab = model.hparams.n_vocab;
 
+    std::vector<uint8_t> work_buffer;
+
     for (int ex=0; ex<n_examples; ++ex) {
         struct ggml_init_params params = {
             /*.mem_size =*/ compute_size,
@@ -1586,7 +1599,6 @@ int main(int argc, char ** argv) {
         int n_past = 0;
 
         ggml_cgraph gf = {};
-        gf.n_threads = 1;
 
         get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);
 
@@ -1595,7 +1607,7 @@ int main(int argc, char ** argv) {
         struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);
 
         ggml_build_forward_expand(&gf, e);
-        ggml_graph_compute(ctx0, &gf);
+        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
 
         float error_before_opt = ggml_get_f32_1d(e, 0);
 
@@ -1611,7 +1623,7 @@ int main(int argc, char ** argv) {
         ggml_opt(ctx0, opt_params_lbfgs, e);
         //
         ggml_build_forward_expand(&gf, e);
-        ggml_graph_compute(ctx0, &gf);
+        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
 
         float error_after_opt = ggml_get_f32_1d(e, 0);
 
@@ -1659,13 +1671,12 @@ int main(int argc, char ** argv) {
        struct ggml_context * ctx0 = ggml_init(params);
 
        ggml_cgraph gf = {};
-        gf.n_threads = 1;
 
        int n_past = 0;
        struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);
 
        ggml_build_forward_expand(&gf, logits);
-        ggml_graph_compute(ctx0, &gf);
+        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
 
        struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
        struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
@@ -1687,10 +1698,11 @@ int main(int argc, char ** argv) {
    }
 
    print_matrix(model.tok_embeddings);
 
    printf("done\n");
 
    // ggml_free(kv_self.ctx);
    // ggml_free(model_lora.ctx);
    ggml_free(model.ctx);
 
    return 0;
 }
@@ -20,6 +20,17 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
 float tensor_sum_elements(const ggml_tensor * tensor) {
     float sum = 0;
     if (tensor->type==GGML_TYPE_F32) {
@@ -159,13 +170,14 @@ int main(int argc, char ** argv) {
     // printf("Creating compute graph\n");
     struct ggml_cgraph gf = ggml_build_forward(m11xm2);
 
-    gf.n_threads=benchmark_params.n_threads;
-    printf("cgraph->n_threads=%i\n",gf.n_threads);
+    printf("n_threads=%i\n", benchmark_params.n_threads);
 
     TENSOR_DUMP(m11);
     TENSOR_DUMP(m2);
 
-    ggml_graph_compute(ctx, &gf);
+    std::vector<uint8_t> work_buffer;
+
+    ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads);
 
     TENSOR_DUMP(gf.nodes[0]);
 
@@ -187,7 +199,6 @@ int main(int argc, char ** argv) {
 
     // printf("Creating compute graph\n");
     struct ggml_cgraph gf31 = ggml_build_forward(q31);
-    gf31.n_threads=benchmark_params.n_threads;
 
     // Set up a second graph computation to make sure we override the CPU cache lines
     // printf("Creating new tensor q12 & Running quantize\n");
@@ -199,8 +210,7 @@ int main(int argc, char ** argv) {
 
     //printf("Creating compute graph\n");
     struct ggml_cgraph gf32 = ggml_build_forward(q32);
-    gf32.n_threads=benchmark_params.n_threads;
-    printf("cgraph->n_threads=%i\n",gf31.n_threads);
+    printf("n_threads=%i\n", benchmark_params.n_threads);
 
     const int dimx = sizex;
     const int dimy = sizey;
@@ -221,14 +231,15 @@ int main(int argc, char ** argv) {
 
        long long int start = ggml_time_us();
        //printf("Running ggml_graph_compute\n");
-        ggml_graph_compute(ctx, &gf31);
+        ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads);
+
        long long int stop = ggml_time_us();
        long long int usec = stop-start;
        double gflops = (double)(flops_per_matrix)/usec/1000.0;
        gflops_sum += gflops;
        printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
            i,
-            gf31.n_threads,
+            benchmark_params.n_threads,
            sizex, sizey, sizez, flops_per_matrix,
            usec,gflops);
 
@@ -253,7 +264,7 @@ int main(int argc, char ** argv) {
        }
 
        // Running a different graph computation to make sure we override the CPU cache lines
-        ggml_graph_compute(ctx, &gf32);
+        ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads);
    }
    printf("\n");
    printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
@@ -35,10 +35,9 @@ int main(int argc, char ** argv) {
    struct ggml_context * ctx_eval = NULL;
 
    struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
-    gf.n_threads = 1;
 
    // this allocates all Metal resources and memory buffers
-    auto * ctx_metal = ggml_metal_init();
+    auto * ctx_metal = ggml_metal_init(1);
 
    const size_t max_size_data = ggml_get_max_tensor_size(ctx_data);
    const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval);
@@ -60,6 +60,17 @@ float frand_uniform(struct random_uniform_distribution * rnd) {
     return rnd->rd(rnd->gen);
 }
 
+void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
 struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
     float scale = 1.0f; // xavier
     switch (tensor->n_dims) {
@@ -1426,11 +1437,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
 
    gf->n_nodes = 0;
    gf->n_leafs = 0;
-    gf->work_size = 0;
    gf->perf_runs = 0;
    gf->perf_cycles = 0;
    gf->perf_time_us = 0;
-    gf->work = NULL;
 
    const auto & hparams = model->hparams;
    //const int n_ctx = hparams.n_ctx;
@@ -3162,6 +3171,7 @@ int main(int argc, char ** argv) {
    printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(model.ctx));
    // ggml_print_tensor_objects(model.ctx);
 
+    // TODO: use std::vector<uint8_t> intead of "new"
    size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb);
    uint8_t * compute_addr = new uint8_t[compute_size];
 
@@ -3183,6 +3193,8 @@ int main(int argc, char ** argv) {
        GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size());
    }
 
+    std::vector<uint8_t> work_buffer;
+
    printf("%s: begin training\n", __func__);
 
    for (int ex = 0; ex < params.n_examples; ++ex) {
@@ -3217,9 +3229,6 @@ int main(int argc, char ** argv) {
        struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
        struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
 
-        // ggml_cgraph gf = {};
-        gf->n_threads = params.n_threads;
-        gb->n_threads = params.n_threads;
 
        get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs);
 
@@ -3248,7 +3257,7 @@ int main(int argc, char ** argv) {
            *gb = ggml_build_backward(ctx0, gf, true);
        }
 
-        ggml_graph_compute(ctx0, gf);
+        ggml_graph_compute_helper(work_buffer, gf, params.n_threads);
 
        size_t used_mem_before_opt = ggml_used_mem(ctx0);
 
@@ -3272,7 +3281,7 @@ int main(int argc, char ** argv) {
        model.train_samples += n_batch;
        model.train_tokens += n_batch * n_tokens;
 
-        ggml_graph_compute(ctx0, gf);
+        ggml_graph_compute_helper(work_buffer, gf, params.n_threads);
 
        float error_after_opt = ggml_get_f32_1d(loss, 0);
 
@@ -3354,13 +3363,12 @@ int main(int argc, char ** argv) {
        struct ggml_context * ctx0 = ggml_init(cparams);
 
        ggml_cgraph gf = {};
-        gf.n_threads = params.n_threads;
 
        int n_past = 0;
        struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);
 
        ggml_build_forward_expand(&gf, logits);
-        ggml_graph_compute(ctx0, &gf);
+        ggml_graph_compute_helper(work_buffer, &gf, params.n_threads);
 
        //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
        //struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
@@ -3386,6 +3394,7 @@ int main(int argc, char ** argv) {
    delete[] compute_addr;
    delete[] compute_buf_0;
    delete[] compute_buf_1;
 
    llama_free(lctx);
    llama_free_model(lmodel);
    ggml_free(model.ctx);
 
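The backwards-compatible ggml_graph_compute_with_ctx() from the commit message is defined in ggml.c, and its hunk is not among those shown on this page. Below is a rough sketch of such a wrapper built from the new primitives, assuming the (ctx, graph, n_threads) shape implied by the replaced ggml_graph_compute(ctx, &gf) call sites; the actual implementation may differ:

    // Hypothetical wrapper in the spirit of ggml_graph_compute_with_ctx();
    // scratch memory is taken from the context, so its lifetime matches ctx.
    static void graph_compute_with_ctx_sketch(struct ggml_context * ctx, struct ggml_cgraph * graph, int n_threads) {
        struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);

        if (plan.work_size > 0) {
            // allocate the work buffer as a 1-D I8 tensor inside the context
            struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, plan.work_size);
            plan.work_data = (uint8_t *) buf->data;
        }

        ggml_graph_compute(graph, &plan);
    }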