From 9c9bdaf0b8e9e3d04c0caa83a7722a14b629e475 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 6 Jul 2023 21:18:42 +0300
Subject: [PATCH] llama : fix duplicate symbols + refactor example benchmark

---
 examples/benchmark/benchmark-matmult.cpp | 38 +++++++++---------------
 llama.cpp                                |  2 +-
 2 files changed, 15 insertions(+), 25 deletions(-)

diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
index 840f4fe52..f7215f43b 100644
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -20,6 +20,17 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
 float tensor_sum_elements(const ggml_tensor * tensor) {
     float sum = 0;
     if (tensor->type==GGML_TYPE_F32) {
@@ -166,14 +177,7 @@ int main(int argc, char ** argv) {
 
     std::vector<uint8_t> work_buffer;
 
-    {
-        ggml_cplan pf = ggml_graph_plan(&gf, benchmark_params.n_threads);
-        if (pf.work_size > 0) {
-            work_buffer.resize(pf.work_size);
-            pf.work_data = work_buffer.data();
-        }
-        ggml_graph_compute(&gf, &pf);
-    }
+    ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads);
 
     TENSOR_DUMP(gf.nodes[0]);
 
@@ -227,14 +231,7 @@ int main(int argc, char ** argv) {
 
         long long int start = ggml_time_us();
         //printf("Running ggml_graph_compute\n");
-        {
-            ggml_cplan pf31 = ggml_graph_plan(&gf31, benchmark_params.n_threads);
-            if (pf31.work_size > 0) {
-                work_buffer.resize(pf31.work_size);
-                pf31.work_data = work_buffer.data();
-            }
-            ggml_graph_compute(&gf31, &pf31);
-        }
+        ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads);
 
         long long int stop = ggml_time_us();
         long long int usec = stop-start;
@@ -267,14 +264,7 @@ int main(int argc, char ** argv) {
         }
 
         // Running a different graph computation to make sure we override the CPU cache lines
-        {
-            ggml_cplan pf32 = ggml_graph_plan(&gf32, benchmark_params.n_threads);
-            if (pf32.work_size > 0) {
-                work_buffer.resize(pf32.work_size);
-                pf32.work_data = work_buffer.data();
-            }
-            ggml_graph_compute(&gf32, &pf32);
-        }
+        ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads);
     }
     printf("\n");
     printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
diff --git a/llama.cpp b/llama.cpp
index 0aecbeedc..5221ab5a2 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -83,7 +83,7 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
 // ggml helpers
 //
 
-void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
     struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
 
     if (plan.work_size > 0) {
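
Note (illustration, not part of the patch): the `static` added to ggml_graph_compute_helper in llama.cpp is what resolves the "duplicate symbols" mentioned in the subject line, since `static` at namespace scope gives the definition internal linkage so it can no longer collide with a same-named definition in another translation unit. A minimal sketch of that rule follows; the file names util_a.cpp / util_b.cpp and the report() function are hypothetical and only stand in for the linkage behaviour.

    // util_a.cpp (hypothetical)
    #include <cstdio>
    void report(int x) { std::printf("a: %d\n", x); }        // external linkage

    // util_b.cpp (hypothetical)
    #include <cstdio>
    void report(int x) { std::printf("b: %d\n", x); }        // second definition of the same symbol
    // Linking util_a.o with util_b.o fails with a "duplicate symbol" error.

    // util_b.cpp, fixed the same way the patch fixes ggml_graph_compute_helper:
    #include <cstdio>
    static void report(int x) { std::printf("b: %d\n", x); } // internal linkage, no collision

The benchmark example keeps a plain non-static copy of the helper, which is only safe as long as that binary is not also linked against the object file that defines the llama.cpp version.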