diff --git a/llama.cpp b/llama.cpp
index 5c9aea9de..0aecbeedc 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -79,6 +79,25 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
 
+//
+// ggml helpers
+//
+
+void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
+//
+// memory sizes
+//
+
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> k_sizes = {
@@ -761,7 +780,6 @@ struct llama_model_loader {
 
 };
 
-
 //
 // kv cache
 //
@@ -1623,12 +1641,7 @@ static bool llama_eval_internal(
 #endif
 
     if (call_ggml_graph_compute) {
-        ggml_cplan pf = ggml_graph_plan(&gf, n_threads);
-        if (pf.work_size > 0) {
-            lctx.work_buffer.resize(pf.work_size);
-            pf.work_data = lctx.work_buffer.data();
-        }
-        ggml_graph_compute(&gf, &pf);
+        ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
     }
 
     if (cgraph_fname) {
@@ -2983,14 +2996,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
             struct ggml_cgraph gf = ggml_build_forward(r);
 
-            {
-                ggml_cplan pf = ggml_graph_plan(&gf, n_threads);
-                if (pf.work_size > 0) {
-                    work_buffer.resize(pf.work_size);
-                    pf.work_data = work_buffer.data();
-                }
-                ggml_graph_compute(&gf, &pf);
-            }
+            ggml_graph_compute_helper(work_buffer, &gf, n_threads);
 
             // we won't need these tensors again, reset the context to save memory
             ggml_free(lora_ctx);
@@ -3162,15 +3168,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
-
-            {
-                ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1);
-                if (pf.work_size > 0) {
-                    ctx->work_buffer.resize(pf.work_size);
-                    pf.work_data = ctx->work_buffer.data();
-                }
-                ggml_graph_compute(&gf, &pf);
-            }
+            ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
 
             ggml_free(cpy_ctx);
         }
@@ -3275,15 +3273,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
-
-            {
-                ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1);
-                if (pf.work_size > 0) {
-                    ctx->work_buffer.resize(pf.work_size);
-                    pf.work_data = ctx->work_buffer.data();
-                }
-                ggml_graph_compute(&gf, &pf);
-            }
+            ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
 
             ggml_free(cpy_ctx);
         }