From b1331d7e604eeae9b9b0e4f7b3a50b70b49c1b44 Mon Sep 17 00:00:00 2001
From: mqy
Date: Tue, 4 Jul 2023 20:38:46 +0800
Subject: [PATCH] reusable buffers

---
 examples/baby-llama/baby-llama.cpp       | 23 +++---
 examples/benchmark/benchmark-matmult.cpp | 29 +++-----
 .../train-text-from-scratch.cpp          | 29 +++-----
 ggml.c                                   |  3 +-
 ggml.h                                   |  2 +-
 llama.cpp                                | 70 ++++++++-----------
 tests/test-grad0.c                       | 64 +++++++++--------
 tests/test-opt.c                         | 46 ++++++++----
 8 files changed, 129 insertions(+), 137 deletions(-)

diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp
index f147c23a2..785e7e886 100644
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -1569,6 +1569,8 @@ int main(int argc, char ** argv) {
     int n_tokens = model.hparams.n_ctx;
     int n_vocab  = model.hparams.n_vocab;
 
+    auto compute_plan_buffer = std::vector<uint8_t>();
+
     for (int ex=0; ex<n_examples; ++ex) {
 
         {
             struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         float error_before_opt = ggml_get_f32_1d(e, 0);
@@ -1625,13 +1624,10 @@ int main(int argc, char ** argv) {
         {
             struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         float error_after_opt = ggml_get_f32_1d(e, 0);
@@ -1689,13 +1685,10 @@ int main(int argc, char ** argv) {
         {
             struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
index e4f361e13..e7d75c9ae 100644
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -164,16 +164,15 @@ int main(int argc, char ** argv) {
     TENSOR_DUMP(m11);
     TENSOR_DUMP(m2);
 
+    auto compute_plan_buffer = std::vector<uint8_t>();
+
     {
-        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads);
+        auto plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads);
         if (plan.work_size > 0) {
-            plan.work_data = malloc(plan.work_size);
-            GGML_ASSERT(plan.work_data);
+            compute_plan_buffer.resize(plan.work_size);
+            plan.work_data = compute_plan_buffer.data();
         }
         ggml_graph_compute(&plan, &gf);
-        if (plan.work_data) {
-            free(plan.work_data);
-        }
     }
 
     TENSOR_DUMP(gf.nodes[0]);
@@ -229,15 +228,12 @@ int main(int argc, char ** argv) {
             long long int start = ggml_time_us();
             //printf("Running ggml_graph_compute\n");
             {
-                struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads);
+                auto plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads);
                 if (plan.work_size > 0) {
-                    plan.work_data = malloc(plan.work_size);
-                    GGML_ASSERT(plan.work_data);
+                    compute_plan_buffer.resize(plan.work_size);
+                    plan.work_data = compute_plan_buffer.data();
                 }
                 ggml_graph_compute(&plan, &gf31);
-                if (plan.work_data) {
-                    free(plan.work_data);
-                }
             }
 
             long long int stop = ggml_time_us();
@@ -272,15 +268,12 @@ int main(int argc, char ** argv) {
 
             // Running a different graph computation to make sure we override the CPU cache lines
             {
-                struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads);
+                auto plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads);
                 if (plan.work_size > 0) {
-                    plan.work_data = malloc(plan.work_size);
-                    GGML_ASSERT(plan.work_data);
+                    compute_plan_buffer.resize(plan.work_size);
+                    plan.work_data = compute_plan_buffer.data();
                 }
                 ggml_graph_compute(&plan, &gf32);
-                if (plan.work_data) {
-                    free(plan.work_data);
-                }
             }
         }
         printf("\n");
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 83da31531..0345b8dc0 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -3181,6 +3181,8 @@ int main(int argc, char ** argv) {
         GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size());
     }
 
+    auto compute_plan_buffer = std::vector<uint8_t>();
+
     printf("%s: begin training\n", __func__);
 
     for (int ex = 0; ex < params.n_examples; ++ex) {
@@ -3244,15 +3246,12 @@ int main(int argc, char ** argv) {
         }
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(gf, params.n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         size_t used_mem_before_opt = ggml_used_mem(ctx0);
@@ -3278,15 +3277,12 @@ int main(int argc, char ** argv) {
         model.train_tokens += n_batch * n_tokens;
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(gf, params.n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         float error_after_opt = ggml_get_f32_1d(loss, 0);
@@ -3376,15 +3372,12 @@ int main(int argc, char ** argv) {
             ggml_build_forward_expand(&gf, logits);
 
             {
-                struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, params.n_threads);
+                auto plan = ggml_graph_compute_make_plan(&gf, params.n_threads);
                 if (plan.work_size > 0) {
-                    plan.work_data = malloc(plan.work_size);
-                    GGML_ASSERT(plan.work_data);
+                    compute_plan_buffer.resize(plan.work_size);
+                    plan.work_data = compute_plan_buffer.data();
                 }
                 ggml_graph_compute(&plan, &gf);
-                if (plan.work_data) {
-                    free(plan.work_data);
-                }
             }
 
             //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
diff --git a/ggml.c b/ggml.c
index 0e906d0c3..94a710706 100644
--- a/ggml.c
+++ b/ggml.c
@@ -15974,7 +15974,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     const struct ggml_cgraph * cgraph = state->shared->cgraph;
 
     const struct ggml_graph_compute_plan * plan = state->shared->plan;
-    const int *n_tasks_arr = plan->n_tasks;
+    const int * n_tasks_arr = plan->n_tasks;
 
     const int n_threads = state->shared->n_threads;
     set_numa_thread_affinity(state->ith, n_threads);
@@ -16490,6 +16490,7 @@ void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgrap
     }
 }
 
+// TODO: avoid allocating memory frequently.
 static void ggml_graph_compute_sugar(struct ggml_cgraph * cgraph, int n_threads) {
     struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(cgraph, n_threads);
     if (plan.work_size > 0) {
diff --git a/ggml.h b/ggml.h
index 0f1bd138b..1b50ab866 100644
--- a/ggml.h
+++ b/ggml.h
@@ -449,7 +449,7 @@ extern "C" {
         // Size of work buffer, calculated by `ggml_graph_compute_make_plan()`.
         size_t work_size;
         // Work buffer, to be allocated by caller before calling to `ggml_graph_compute()`.
-        void * work_data;
+        uint8_t * work_data;
 
         int n_threads;
diff --git a/llama.cpp b/llama.cpp
index d1ae57298..c29d46d8d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -321,6 +321,10 @@ struct llama_context {
     // input embedding (1-dimensional array: [n_embd])
     std::vector<float> embedding;
 
+    // reusable buffer for `struct ggml_graph_compute_plan.work_data`
+    // std::vector guarantees the elements are stored contiguously.
+    std::vector<uint8_t> compute_plan_buffer;
+
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
     llama_ctx_buffer buf_compute;
@@ -1591,10 +1595,13 @@ static bool llama_eval_internal(
     // run the computation
     ggml_build_forward_expand(&gf, cur);
 
+    bool call_ggml_graph_compute = true;
+
 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal && N == 1) {
         ggml_metal_graph_compute(lctx.ctx_metal, &gf);
         ggml_metal_get_tensor  (lctx.ctx_metal, cur);
+        call_ggml_graph_compute = false;
     } else {
         // IMPORTANT:
         // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1611,33 +1618,18 @@ static bool llama_eval_internal(
             ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
             ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
         }
-
-        {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
-            if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
-            }
-            ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
-        }
-    }
-#else
-    {
-        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
-        if (plan.work_size > 0) {
-            plan.work_data = malloc(plan.work_size);
-            GGML_ASSERT(plan.work_data);
-        }
-        ggml_graph_compute(&plan, &gf);
-        if (plan.work_data) {
-            free(plan.work_data);
-        }
-    }
     }
 #endif
 
+    if (call_ggml_graph_compute) {
+        auto plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
+        if (plan.work_size > 0) {
+            lctx.compute_plan_buffer.resize(plan.work_size);
+            plan.work_data = lctx.compute_plan_buffer.data();
+        }
+        ggml_graph_compute(&plan, &gf);
+    }
+
     if (cgraph_fname) {
         ggml_graph_export(&gf, cgraph_fname);
     }
@@ -2822,6 +2814,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     // read tensors and apply
     bool warned = false;
     int n_tensors = 0;
+
+    auto compute_plan_buffer = std::vector<uint8_t>();
+
     while (true) {
         int32_t n_dims;
         int32_t length;
@@ -2988,15 +2983,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             struct ggml_cgraph gf = ggml_build_forward(r);
 
             {
-                struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
+                auto plan = ggml_graph_compute_make_plan(&gf, n_threads);
                 if (plan.work_size > 0) {
-                    plan.work_data = malloc(plan.work_size);
-                    GGML_ASSERT(plan.work_data);
+                    compute_plan_buffer.resize(plan.work_size);
+                    plan.work_data = compute_plan_buffer.data();
                 }
                 ggml_graph_compute(&plan, &gf);
-                if (plan.work_data) {
-                    free(plan.work_data);
-                }
             }
 
             // we won't need these tensors again, reset the context to save memory
@@ -3171,15 +3163,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
 
             {
-                struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
+                auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
                 if (plan.work_size > 0) {
-                    plan.work_data = malloc(plan.work_size);
-                    GGML_ASSERT(plan.work_data);
+                    ctx->compute_plan_buffer.resize(plan.work_size);
+                    plan.work_data = ctx->compute_plan_buffer.data();
                 }
                 ggml_graph_compute(&plan, &gf);
-                if (plan.work_data) {
-                    free(plan.work_data);
-                }
             }
 
             ggml_free(cpy_ctx);
@@ -3287,15 +3276,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
 
             {
-                struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
+                auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
                 if (plan.work_size > 0) {
-                    plan.work_data = malloc(plan.work_size);
-                    GGML_ASSERT(plan.work_data);
+                    ctx->compute_plan_buffer.resize(plan.work_size);
+                    plan.work_data = ctx->compute_plan_buffer.data();
                 }
                 ggml_graph_compute(&plan, &gf);
-                if (plan.work_data) {
-                    free(plan.work_data);
-                }
             }
 
             ggml_free(cpy_ctx);
diff --git a/tests/test-grad0.c b/tests/test-grad0.c
index 477fedfee..548547727 100644
--- a/tests/test-grad0.c
+++ b/tests/test-grad0.c
@@ -191,6 +191,32 @@ void print_elements(const char* label, const struct ggml_tensor * t) {
 
 }
 
+struct compute_plan_buffer {
+    size_t    size;
+    uint8_t * data;
+};
+
+static uint8_t * ensure_plan_work_data(struct compute_plan_buffer *buf, size_t size) {
+    if (size == 0) {
+        return NULL;
+    }
+
+    GGML_ASSERT(buf);
+
+    if (buf->size == 0) {
+        buf->data = malloc(size);
+        buf->size = size;
+    } else if (buf->size < size) {
+        buf->data = realloc(buf->data, size);
+        buf->size = size;
+    } else {
+        // skip shrinking.
+    }
+
+    GGML_ASSERT(buf->data);
+    return buf->data;
+}
+
 bool check_gradient(
         const char * op_name,
         struct ggml_context * ctx0,
@@ -218,6 +244,8 @@ bool check_gradient(
 
     struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
 
+    struct compute_plan_buffer plan_buf = { /*.size = */ 0, /*.data =*/ NULL };
+
     {
         struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
         if (plan.work_size > 0) {
@@ -235,14 +263,8 @@ bool check_gradient(
 
     {
         struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads);
-        if (plan.work_size > 0) {
-            plan.work_data = malloc(plan.work_size);
-            GGML_ASSERT(plan.work_data);
-        }
+        plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
         ggml_graph_compute(&plan, &gb);
-        if (plan.work_data) {
-            free(plan.work_data);
-        }
     }
 
     // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot");
@@ -259,14 +281,8 @@ bool check_gradient(
 
         {
             struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
-            if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
-            }
+            plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         const float f0 = ggml_get_f32_1d(f, 0);
@@ -275,14 +291,8 @@ bool check_gradient(
 
         {
             struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
-            if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
-            }
+            plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         const float f1 = ggml_get_f32_1d(f, 0);
@@ -297,14 +307,8 @@ bool check_gradient(
 
         {
             struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads);
-            if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
-            }
+            plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
             ggml_graph_compute(&plan, &gb);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         const float g1 = get_element(x[i]->grad, k);
@@ -321,6 +325,10 @@ bool check_gradient(
         }
     }
 
+    if (plan_buf.data) {
+        free(plan_buf.data);
+    }
+
     return true;
 }
 
diff --git a/tests/test-opt.c b/tests/test-opt.c
index cb0d58199..35d070dc7 100644
--- a/tests/test-opt.c
+++ b/tests/test-opt.c
@@ -114,6 +114,31 @@ void set_element(struct ggml_tensor * t, int idx, float value) {
     ((float *)t->data)[idx] = value;
 }
 
+
+struct compute_plan_buffer {
+    size_t    size;
+    uint8_t * data;
+};
+
+static uint8_t * ensure_plan_work_data(struct compute_plan_buffer *buf, size_t size) {
+    if (size == 0) {
+        return NULL;
+    }
+
+    if (buf->size == 0) {
+        buf->data = malloc(size);
+        buf->size = size;
+    } else if (buf->size < size) {
+        buf->data = realloc(buf->data, size);
+        buf->size = size;
+    } else {
+        // skip shrinking.
+    }
+
+    GGML_ASSERT(buf->data);
+    return buf->data;
+}
+
 int main(int argc, const char ** argv) {
     struct ggml_init_params params = {
         .mem_size   = 1024*1024*1024,
@@ -141,16 +166,11 @@ int main(int argc, const char ** argv) {
 
     struct ggml_cgraph ge = ggml_build_forward(e);
     ggml_graph_reset (&ge);
+    struct compute_plan_buffer plan_buf = { /*.size = */ 0, /*.data =*/ NULL };
     {
         struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1);
-        if (plan.work_size > 0) {
-            plan.work_data = malloc(plan.work_size);
-            GGML_ASSERT(plan.work_data);
-        }
+        plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
         ggml_graph_compute(&plan, &ge);
-        if (plan.work_data) {
-            free(plan.work_data);
-        }
     }
 
     const float fe = ggml_get_f32_1d(e, 0);
@@ -164,14 +184,12 @@ int main(int argc, const char ** argv) {
 
     {
         struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1);
-        if (plan.work_size > 0) {
-            plan.work_data = malloc(plan.work_size);
-            GGML_ASSERT(plan.work_data);
-        }
+        plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
        ggml_graph_compute(&plan, &ge);
-        if (plan.work_data) {
-            free(plan.work_data);
-        }
+    }
+
+    if (plan_buf.data) {
+        free(plan_buf.data);
     }
 
     const float fe_opt = ggml_get_f32_1d(e, 0);
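
Note (not part of the patch): the sketch below illustrates the caller-side pattern this series converges on, i.e. make a compute plan first, point plan.work_data at a caller-owned buffer that is grown on demand and reused across evaluations, then run the graph. The helper name compute_with_reusable_buffer and the surrounding scaffolding are illustrative only; ggml_graph_compute_make_plan, ggml_graph_compute and the uint8_t * work_data field are assumed to behave exactly as declared in the ggml.h hunk above.

    // Illustrative sketch only: evaluate a graph while reusing `buf` as the
    // scratch buffer across calls, mirroring llama_context::compute_plan_buffer.
    #include <cstdint>
    #include <vector>
    #include "ggml.h"

    static void compute_with_reusable_buffer(struct ggml_cgraph * gf, int n_threads,
                                             std::vector<uint8_t> & buf) {
        // The plan reports how much scratch memory the graph needs for n_threads.
        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, n_threads);
        if (plan.work_size > 0) {
            // Grow the caller-owned buffer when needed and hand it to the plan,
            // instead of doing a malloc/free pair on every evaluation.
            buf.resize(plan.work_size);
            plan.work_data = buf.data();
        }
        ggml_graph_compute(&plan, gf);
    }

Because std::vector::resize only reallocates when the requested size exceeds the current capacity, repeated evaluations with similar work sizes keep reusing one allocation; the C test helpers (ensure_plan_work_data) implement the same grow-only policy with malloc/realloc.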