From 8781013ef654270cbead3e0011e33a6d690fb168 Mon Sep 17 00:00:00 2001
From: Cebtenzzre
Date: Mon, 18 Sep 2023 10:03:53 -0400
Subject: [PATCH 1/5] make : restore build-info.h dependency for several targets (#3205)

---
 Makefile                                     | 14 +++++++-------
 common/common.h                              |  1 -
 examples/benchmark/benchmark-matmult.cpp     |  1 +
 examples/embd-input/embd-input-lib.cpp       |  1 +
 examples/embedding/embedding.cpp             |  1 +
 examples/perplexity/perplexity.cpp           |  1 +
 examples/quantize-stats/quantize-stats.cpp   |  1 +
 examples/quantize/quantize.cpp               |  1 +
 examples/save-load-state/save-load-state.cpp |  1 +
 9 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/Makefile b/Makefile
index dc8ae3807..e07db8afa 100644
--- a/Makefile
+++ b/Makefile
@@ -514,22 +514,22 @@ main: examples/main/main.cpp build-info.h ggml.
 	@echo '==== Run ./main -h for help. ===='
 	@echo
 
-simple: examples/simple/simple.cpp ggml.o llama.o common.o $(OBJS)
+simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-quantize: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS)
+quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS)
+quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS)
+perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS)
+embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o common.o $(OBJS)
+save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
@@ -582,7 +582,7 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
 
 tests: $(TEST_TARGETS)
 
-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp ggml.o $(OBJS)
+benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	./$@
 
diff --git a/common/common.h b/common/common.h
index f9dfd4a2c..18aea38ce 100644
--- a/common/common.h
+++ b/common/common.h
@@ -3,7 +3,6 @@
 #pragma once
 
 #include "llama.h"
-#include "build-info.h"
 
 #define LOG_NO_FILE_LINE_FUNCTION
 #include "log.h"
diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
index 561309acb..b16ad24d3 100644
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -1,3 +1,4 @@
+#include "build-info.h"
 #include "common.h"
 #include "ggml.h"
 
diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp
index fc6e44eb2..c995eef35 100644
--- a/examples/embd-input/embd-input-lib.cpp
+++ b/examples/embd-input/embd-input-lib.cpp
@@ -1,3 +1,4 @@
+#include "build-info.h"
 #include "common.h"
 #include "embd-input.h"
 
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 0788f362c..27d605f4e 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -1,3 +1,4 @@
+#include "build-info.h"
 #include "common.h"
 #include "llama.h"
 
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 4958cdfb9..2b375e34e 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -1,3 +1,4 @@
+#include "build-info.h"
 #include "common.h"
 #include "llama.h"
 
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 9f930dede..94edb94d9 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -1,4 +1,5 @@
 #define LLAMA_API_INTERNAL
+#include "build-info.h"
 #include "common.h"
 #include "ggml.h"
 #include "llama.h"
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index acb79e690..1c1d957e6 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -1,3 +1,4 @@
+#include "build-info.h"
 #include "common.h"
 #include "llama.h"
 
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index eac307904..95527bb86 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -1,3 +1,4 @@
+#include "build-info.h"
 #include "common.h"
 #include "llama.h"
 

From d119c04c159d015a93567df7e73e0e45a22d0f1d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 20 Sep 2023 10:02:39 +0300
Subject: [PATCH 2/5] examples : fix benchmark-matmult (#1554)

The precision for Q4_0 has degraded since #1508

---
 examples/benchmark/benchmark-matmult.cpp | 28 +++++++++++++-----------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
index b16ad24d3..c8f7d4869 100644
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -33,11 +33,11 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
 }
 
 float tensor_sum_elements(const ggml_tensor * tensor) {
-    float sum = 0;
-    if (tensor->type==GGML_TYPE_F32) {
+    double sum = 0;
+    if (tensor->type == GGML_TYPE_F32) {
         for (int j = 0; j < tensor->ne[1]; j++) {
             for (int k = 0; k < tensor->ne[0]; k++) {
-                sum += ((float *) tensor->data)[j*tensor->ne[0]+k];
+                sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
             }
         }
     }
@@ -126,12 +126,15 @@ int main(int argc, char ** argv) {
 
     //printf("Memsize required = %i\n", sizex*sizex);
 
+    // TODO: perform the bench for all types or for a user specified type
+    const ggml_type qtype = GGML_TYPE_Q4_1;
+
     size_t ctx_size = 0;
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
     ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
     ctx_size += 1024*1024*16;
@@ -164,7 +167,7 @@ int main(int argc, char ** argv) {
     struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
     ggml_set_f32(m2, 2.0f);
 
-    printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
+    printf("\n------ Test 1 - Matrix Mult via F32 code\n");
     // printf("Creating new tensor m11xm2\n");
     struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
 
@@ -182,17 +185,16 @@ int main(int argc, char ** argv) {
 
     TENSOR_DUMP(gf.nodes[0]);
 
-    printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
+    printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
 
     int32_t nelements = sizex*sizey;
-    int32_t ne[2] = { sizex, sizey };
 
     std::vector<int64_t> hist_cur(1 << 4, 0);
 
     // Set up a the benchmark matrices
     // printf("Creating new tensor q11 & Running quantize\n");
-    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
-    ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
+    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());
 
     // Set up a the compute graph
     // printf("Creating new tensor q31\n");
@@ -203,8 +205,8 @@ int main(int argc, char ** argv) {
 
     // Set up a second graph computation to make sure we override the CPU cache lines
     // printf("Creating new tensor q12 & Running quantize\n");
-    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
-    ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
+    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());
 
     // printf("Creating new tensor q32\n");
     struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
@@ -221,7 +223,7 @@ int main(int argc, char ** argv) {
     printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
 
 
-    // Let's use the F32 result from above as a reference for the q4_0 multiplication
+    // Let's use the F32 result from above as a reference for the quantized multiplication
     float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
 
     printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");

From a40f2b656fab364ce0aff98dbefe9bd9c3721cc9 Mon Sep 17 00:00:00 2001
From: Alon
Date: Wed, 20 Sep 2023 15:06:36 +0300
Subject: [PATCH 3/5] CI: FreeBSD fix (#3258)

* - freebsd ci: use qemu

---
 .github/workflows/build.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 4b6071f5a..aecebaf93 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -468,6 +468,7 @@ jobs:
         with:
           operating_system: freebsd
           version: '13.2'
+          hypervisor: 'qemu'
           run: |
             sudo pkg update
             sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas

From 80834daecf4b9021770361a6d5e1b9c7a60e6854 Mon Sep 17 00:00:00 2001
From: kang
Date: Wed, 20 Sep 2023 22:48:22 +0900
Subject: [PATCH 4/5] flake : Restore default package's buildInputs (#3262)

---
 flake.nix | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/flake.nix b/flake.nix
index b0fb8642c..7723357af 100644
--- a/flake.nix
+++ b/flake.nix
@@ -52,7 +52,8 @@
         in
         {
           packages.default = pkgs.stdenv.mkDerivation {
-            inherit name src meta postPatch nativeBuildInputs buildInputs postInstall;
+            inherit name src meta postPatch nativeBuildInputs postInstall;
+            buildInputs = osSpecific;
             cmakeFlags = cmakeFlags ++ (if isAarch64 && isDarwin then [
               "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"

From 65c2c1c5ab7c5089dbc6d10bc49b9c58f0164317 Mon Sep 17 00:00:00 2001
From: Cebtenzzre
Date: Wed, 20 Sep 2023 12:06:08 -0400
Subject: [PATCH 5/5] benchmark-matmult : do not use integer abs() on a float (#3277)

---
 examples/benchmark/benchmark-matmult.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
index c8f7d4869..f1c382aa9 100644
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -21,7 +21,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
     struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
 
     if (plan.work_size > 0) {
@@ -32,7 +32,7 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
     ggml_graph_compute(graph, &plan);
 }
 
-float tensor_sum_elements(const ggml_tensor * tensor) {
+static float tensor_sum_elements(const ggml_tensor * tensor) {
     double sum = 0;
     if (tensor->type == GGML_TYPE_F32) {
         for (int j = 0; j < tensor->ne[1]; j++) {
@@ -44,7 +44,7 @@ float tensor_sum_elements(const ggml_tensor * tensor) {
     return sum;
 }
 
-void tensor_dump(const ggml_tensor * tensor, const char * name) {
+static void tensor_dump(const ggml_tensor * tensor, const char * name) {
     printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
         tensor->type, ggml_type_name(tensor->type),
         tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
@@ -59,7 +59,7 @@ struct benchmark_params_struct {
     int32_t n_iterations = 10;
 };
 
-void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
+static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
@@ -253,7 +253,7 @@ int main(int argc, char ** argv) {
     // Check that the matrix multiplication result is in the right ballpark
    // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
     float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
-    float delta = abs(sum_of_Q4_result - sum_of_F32_reference);
+    float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
     float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
 
     if (delta > allowed_delta) {
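
A note on PATCH 5/5 (an illustration of ours, not code from the series): the bug is C++
overload resolution. <cstdlib> declares int abs(int), and whether the floating-point
overloads of abs are also visible unqualified depends on the headers included and the
implementation, so an unqualified abs() on a float can silently convert the argument to
int and discard the fraction. The minimal standalone sketch below shows the failure mode;
the variable names and the sample value are ours, and the int conversion is written out
explicitly so the truncating path behaves the same on every compiler:

    #include <cmath>    // floating-point overloads of std::abs
    #include <cstdio>
    #include <cstdlib>  // int abs(int)

    int main(void) {
        const float delta = -0.0005f; // a small error, like the benchmark's delta

        // What the int overload computes: the float argument is converted to
        // int first, so the fractional part is discarded and the result is 0.
        const int truncated = std::abs(static_cast<int>(delta));

        // What the patch uses: std::abs from <cmath> selects a floating-point
        // overload and preserves the magnitude (0.0005).
        const float preserved = std::abs(delta);

        std::printf("truncated = %d, preserved = %f\n", truncated, preserved);
        return 0;
    }

With <cmath>'s overloads in scope, the qualified std::abs call always resolves to a
floating-point overload, so the delta check in benchmark-matmult compares a real
magnitude instead of a value truncated toward zero.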