From 82d146df9b43cf677e0dbce20b03cf864958a0cc Mon Sep 17 00:00:00 2001 From: Pavol Rusnak Date: Thu, 13 Apr 2023 11:33:16 +0200 Subject: [PATCH 01/17] do not force the prompt file to end with a new line (#908) --- .editorconfig | 3 +++ prompts/chat-with-bob.txt | 2 +- prompts/reason-act.txt | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.editorconfig b/.editorconfig index df8aaf504..135a7e4bc 100644 --- a/.editorconfig +++ b/.editorconfig @@ -14,3 +14,6 @@ indent_size = 4 [Makefile] indent_style = tab + +[prompts/*.txt] +insert_final_newline = unset diff --git a/prompts/chat-with-bob.txt b/prompts/chat-with-bob.txt index 009da39ae..ad494d831 100644 --- a/prompts/chat-with-bob.txt +++ b/prompts/chat-with-bob.txt @@ -4,4 +4,4 @@ User: Hello, Bob. Bob: Hello. How may I help you today? User: Please tell me the largest city in Europe. Bob: Sure. The largest city in Europe is Moscow, the capital of Russia. -User: +User: \ No newline at end of file diff --git a/prompts/reason-act.txt b/prompts/reason-act.txt index 872016631..a4f4f4ee6 100644 --- a/prompts/reason-act.txt +++ b/prompts/reason-act.txt @@ -15,4 +15,4 @@ Answer: The calculate tool says it is 9.3333333333 Question: What is capital of france? Thought: Do I need to use an action? No, I know the answer Answer: Paris is the capital of France -Question: +Question: \ No newline at end of file From 95ea26f6e92d620a5437f576b80868aee7f808d6 Mon Sep 17 00:00:00 2001 From: SebastianApel <13675545+SebastianApel@users.noreply.github.com> Date: Thu, 13 Apr 2023 14:46:23 +0200 Subject: [PATCH 02/17] benchmark : add tool for timing q4_0 matrix multiplication (#653) * Initial version of q4_0 matrix multiplication benchmark * Bugfix: Added dependency to ggml.o to benchmark * Reviewer requests: added parameter for threads, switched to ggml_time_us() * Reviewer input: removed rtsc, use epsilon for check * Review comment: Removed set_locale * Feature: Param for numer of iterations, Bugfix for use of parameter threads * Reviewer suggestion: Moved to examples * Reviewer feedback: Updated clean: and benchmark: sections --------- Co-authored-by: Georgi Gerganov --- Makefile | 7 +- examples/benchmark/benchmark-q4_0-matmult.c | 270 ++++++++++++++++++++ 2 files changed, 276 insertions(+), 1 deletion(-) create mode 100644 examples/benchmark/benchmark-q4_0-matmult.c diff --git a/Makefile b/Makefile index 3e58a28a7..fe2f26ecb 100644 --- a/Makefile +++ b/Makefile @@ -149,7 +149,7 @@ common.o: examples/common.cpp examples/common.h $(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o clean: - rm -vf *.o main quantize quantize-stats perplexity embedding + rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult main: examples/main/main.cpp ggml.o llama.o common.o $(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS) @@ -171,10 +171,15 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o libllama.so: llama.o ggml.o $(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS) + # # Tests # +benchmark: ggml.o + $(CXX) $(CXXFLAGS) examples/benchmark/benchmark-q4_0-matmult.c ggml.o -o benchmark-q4_0-matmult $(LDFLAGS) + ./benchmark-q4_0-matmult + .PHONY: tests tests: bash ./tests/run-tests.sh diff --git a/examples/benchmark/benchmark-q4_0-matmult.c b/examples/benchmark/benchmark-q4_0-matmult.c new file mode 100644 index 000000000..9ca9b133a --- /dev/null +++ b/examples/benchmark/benchmark-q4_0-matmult.c @@ -0,0 +1,270 @@ +/* + License: MIT License + + Changelog: + - 2023-03-31 Initial version by Sebastian Apel (https://github.com/SebastianApel) + +*/ + +#include +#include "ggml.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +float tensor_sum_elements(struct ggml_tensor * tensor) { + float sum = 0; + if (tensor->type==6) { + for (int j = 0; j < tensor->ne[1]; j++) { + for (int k = 0; k < tensor->ne[0]; k++) { + sum += ((float *) tensor->data)[j*tensor->ne[0]+k]; + } + } + } + return sum; +} + + +/* + These are mapping to unknown + GGML_TYPE_I8, + GGML_TYPE_I16, + GGML_TYPE_I32, + GGML_TYPE_COUNT, +*/ + +#define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN" + +#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \ + TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\ + TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \ + { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); } + +struct benchmark_params_struct { + int32_t n_threads = 1; + int32_t n_iterations = 10; +}; + +void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) { + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + fprintf(stderr, " -i N, --iter N number of iterations to use during computation (default: %d)\n", params.n_iterations); + fprintf(stderr, "\n"); +} + +int main(int argc, char ** argv) { + + + struct benchmark_params_struct benchmark_params; + + bool invalid_param = false; + std::string arg; + for (int i = 1; i < argc; i++) { + arg = argv[i]; + + if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + benchmark_params.n_threads = std::stoi(argv[i]); + } else if (arg == "-i" || arg == "--iter") { + if (++i >= argc) { + invalid_param = true; + break; + } + benchmark_params.n_iterations = std::stoi(argv[i]); + } else if (arg == "-h" || arg == "--help") { + print_usage(argc, argv, benchmark_params); + exit(0); + } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + print_usage(argc, argv, benchmark_params); + exit(1); + } + } + + + // create the ggml context + printf("Starting Test\n"); + + + + struct ggml_context * ctx; + //const int sizex = 4096; + //const int sizey = 11008; + +#undef VERBOSE_DEBUGGING +#ifndef VERBOSE_DEBUGGING + const int sizey = 4096; + const int sizex = 11008; + const int sizez = 128; +#else + /* Working - let's increase size */ + const int sizey = 1; + const int sizex = (8*32); + const int sizez = 1; + + /*const int sizey = 1; + const int sizex = 3*(8*32); + const int sizez = 1;*/ +#endif + + //printf("Memsize required = %i\n", sizex*sizex); + ggml_type wtype = GGML_TYPE_F32; + + size_t ctx_size = 0; + ctx_size += sizex*sizey*ggml_type_sizef(wtype); + ctx_size += sizex*sizey*ggml_type_sizef(wtype); + ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); + ctx_size += sizex*sizeof(float); + ctx_size += 1024*1024*100; + + printf("Allocating Memory of size %li byes, %li MB\n",ctx_size, (ctx_size/1024/1024)); + + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /* no_alloc =*/ 0 + }; + + ctx = ggml_init(params); + if (!ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + + + printf("Creating new tensors\n"); + // printf("Creating new tensor m1\n"); + struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + ggml_set_f32(m11, 1.0f); + + // printf("Creating new tensor m1\n"); + struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + ggml_set_f32(m12, 1.5f); + + // printf("Creating new tensor m2\n"); + struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez); + ggml_set_f32(m2, 2.0f); + + printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n"); + // printf("Creating new tensor m11xm2\n"); + struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2); + + // printf("Creating compute graph\n"); + struct ggml_cgraph gf = ggml_build_forward(m11xm2); + + gf.n_threads=benchmark_params.n_threads; + printf("cgraph->n_threads=%i\n",gf.n_threads); + + TENSOR_DUMP(m11); + TENSOR_DUMP(m2); + + ggml_graph_compute(ctx, &gf); + + TENSOR_DUMP(gf.nodes[0]); + + printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n"); + + int32_t nelements = sizex*sizey; + int32_t ne[2] = { sizex, sizey }; + + std::vector hist_cur(1 << 4, 0); + + // Set up a the benchmark matrices + // printf("Creating new tensor q11 & Running quantize\n"); + struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey); + ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data()); + + // Set up a the compute graph + // printf("Creating new tensor q31\n"); + struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2); + + // printf("Creating compute graph\n"); + struct ggml_cgraph gf31 = ggml_build_forward(q31); + gf31.n_threads=benchmark_params.n_threads; + + // Set up a second graph computation to make sure we override the CPU cache lines + // printf("Creating new tensor q12 & Running quantize\n"); + struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey); + ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data()); + + // printf("Creating new tensor q32\n"); + struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2); + + //printf("Creating compute graph\n"); + struct ggml_cgraph gf32 = ggml_build_forward(q32); + gf32.n_threads=benchmark_params.n_threads; + printf("cgraph->n_threads=%i\n",gf31.n_threads); + + const int dimx = sizex; + const int dimy = sizey; + const int dimz = sizez; + long long int flops_per_dot_product = dimy + dimy; + long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ; + printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - aboout %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000); + + + // Let's use the F32 result from above as a reference for the q4_0 multiplication + float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]); + + + printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n"); + printf("==============================================================================================\n"); + + for (int i=0;i allowed_delta) { + printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n", + sum_of_F32_reference, + sum_of_Q4_result, + delta, + allowed_delta + ); + exit(0); + } + + // Running a different graph computation to make sure we override the CPU cache lines + ggml_graph_compute(ctx, &gf32); + + } + +} From 585d91a156794d30eec16ebe67c8d7a1d41406c1 Mon Sep 17 00:00:00 2001 From: anzz1 Date: Thu, 13 Apr 2023 15:48:21 +0300 Subject: [PATCH 03/17] cmake : add explicit F16C option (x86) (#576) Fixes building for x86 processors missing F16C featureset MSVC not included, as in MSVC F16C is implied with AVX2/AVX512 --- CMakeLists.txt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6bec1f97b..affff3ea1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,6 +56,10 @@ option(LLAMA_AVX "llama: enable AVX" option(LLAMA_AVX2 "llama: enable AVX2" ON) option(LLAMA_AVX512 "llama: enable AVX512" OFF) option(LLAMA_FMA "llama: enable FMA" ON) +# in MSVC F16C is implied with AVX2/AVX512 +if (NOT MSVC) + option(LLAMA_F16C "llama: enable F16C" ON) +endif() # 3rd party libs option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) @@ -207,7 +211,9 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$") add_compile_options(/arch:AVX) endif() else() - add_compile_options(-mf16c) + if (LLAMA_F16C) + add_compile_options(-mf16c) + endif() if (LLAMA_FMA) add_compile_options(-mfma) endif() From 107980d970808c2ccf9334ad033e2782a560b911 Mon Sep 17 00:00:00 2001 From: niansa/tuxifan Date: Thu, 13 Apr 2023 15:03:39 +0200 Subject: [PATCH 04/17] examples : add -n to alpaca and gpt4all scripts (#706) --- examples/alpaca.sh | 2 +- examples/gpt4all.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/alpaca.sh b/examples/alpaca.sh index 4c9aa5077..8d6261730 100755 --- a/examples/alpaca.sh +++ b/examples/alpaca.sh @@ -7,4 +7,4 @@ cd `dirname $0` cd .. -./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 +./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt --ctx_size 2048 -n -1 -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 diff --git a/examples/gpt4all.sh b/examples/gpt4all.sh index d974f95a9..5fd739e55 100755 --- a/examples/gpt4all.sh +++ b/examples/gpt4all.sh @@ -10,6 +10,6 @@ cd .. ./main --color --instruct --threads 4 \ --model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \ --file ./prompts/alpaca.txt \ - --batch_size 8 --ctx_size 2048 \ + --batch_size 8 --ctx_size 2048 -n -1 \ --repeat_last_n 64 --repeat_penalty 1.3 \ --n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95 From 8c3ffc2f048a372639906fb30ec3c2070288d3be Mon Sep 17 00:00:00 2001 From: Vladimir Date: Thu, 13 Apr 2023 15:24:30 +0200 Subject: [PATCH 05/17] ggml : update cblas_sgemm columns var to be more reasonable (#838) --- ggml.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml.c b/ggml.c index a26b4853f..546da30d1 100644 --- a/ggml.c +++ b/ggml.c @@ -6435,7 +6435,7 @@ static void ggml_compute_forward_mul_mat_f32( cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, - x, ne10, + x, ne00, 0.0f, d, ne01); } } @@ -6607,7 +6607,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, - x, ne10, + x, ne00, 0.0f, d, ne01); } } @@ -6820,7 +6820,7 @@ static void ggml_compute_forward_mul_mat_q_f32( cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, - x, ne10, + x, ne00, 0.0f, d, ne01); } } From 4579af95e8e16910f6dbab0994917a5b3901f0cf Mon Sep 17 00:00:00 2001 From: Judd Date: Thu, 13 Apr 2023 21:43:22 +0800 Subject: [PATCH 06/17] zig : update build.zig (#872) * update * update readme * minimize the changes. --------- Co-authored-by: zjli2019 --- README.md | 40 +++++++++++++++++++++++++++++++--------- build.zig | 22 ++++++++-------------- 2 files changed, 39 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index dbc088532..c0958ebd6 100644 --- a/README.md +++ b/README.md @@ -149,21 +149,43 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8 ## Usage -Here are the step for the LLaMA-7B model: +Here are the step for the LLaMA-7B model. + +### Get the Code ```bash -# build this repo git clone https://github.com/ggerganov/llama.cpp cd llama.cpp -make +``` -#For Windows and CMake, use the following command instead: -cd -mkdir build -cd build -cmake .. -cmake --build . --config Release +### Build +Note: For Windows, CMake or Zig can be used. + +1. Use `make` + + ```bash + make + ``` + +1. Use CMake + + ```bash + mkdir build + cd build + cmake .. + cmake --build . --config Release + ``` + +1. Use Zig + + ```bash + zig build -Drelease-fast + ``` + +### Prepare Data & Run + +```bash # obtain the original LLaMA model weights and place them in ./models ls ./models 65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model diff --git a/build.zig b/build.zig index defc2c3ad..306127ffe 100644 --- a/build.zig +++ b/build.zig @@ -1,16 +1,14 @@ const std = @import("std"); -pub fn build(b: *std.Build) void { +pub fn build(b: *std.build.Builder) void { const target = b.standardTargetOptions(.{}); - const optimize = b.standardOptimizeOption(.{}); + const optimize = b.standardReleaseOptions(); const want_lto = b.option(bool, "lto", "Want -fLTO"); - const lib = b.addStaticLibrary(.{ - .name = "llama", - .target = target, - .optimize = optimize, - }); + const lib = b.addStaticLibrary("llama", null); lib.want_lto = want_lto; + lib.setTarget(target); + lib.setBuildMode(optimize); lib.linkLibCpp(); lib.addIncludePath("."); lib.addIncludePath("examples"); @@ -44,16 +42,12 @@ pub fn build(b: *std.Build) void { fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep { const b = args.b; const lib = args.lib; - const target = args.target; - const optimize = args.optimize; const want_lto = args.want_lto; - const exe = b.addExecutable(.{ - .name = name, - .target = target, - .optimize = optimize, - }); + const exe = b.addExecutable(name, null); exe.want_lto = want_lto; + lib.setTarget(args.target); + lib.setBuildMode(args.optimize); exe.addIncludePath("."); exe.addIncludePath("examples"); exe.addCSourceFiles(&.{ From c729ff730a46a135817a3d9988a097e3678a9722 Mon Sep 17 00:00:00 2001 From: Pavol Rusnak Date: Thu, 13 Apr 2023 15:49:05 +0200 Subject: [PATCH 07/17] flake.nix: add all binaries from bin (#848) --- flake.nix | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/flake.nix b/flake.nix index cd1b6d28e..91d2edd79 100644 --- a/flake.nix +++ b/flake.nix @@ -28,10 +28,8 @@ ]; installPhase = '' mkdir -p $out/bin - mv bin/main $out/bin/llama - mv bin/quantize $out/bin/quantize - mv bin/embedding $out/bin/embedding - mv bin/perplexity $out/bin/perplexity + mv bin/* $out/bin/ + mv $out/bin/main $out/bin/llama echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml From 7e941b95eba067cb5b92785e642fd803657376ee Mon Sep 17 00:00:00 2001 From: "Genkagaku.GPT" Date: Thu, 13 Apr 2023 21:54:27 +0800 Subject: [PATCH 08/17] readme : llama node binding (#911) * chore: add nodejs binding * chore: add nodejs binding --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c0958ebd6..a7f220eb2 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,7 @@ New features will probably be added mostly through community contributions. - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) +- Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node) **UI:** From ec29272175d7a79681d9919f3e755b1bcefa0478 Mon Sep 17 00:00:00 2001 From: CRD716 Date: Thu, 13 Apr 2023 08:59:53 -0500 Subject: [PATCH 09/17] readme : remove python 3.10 warning (#929) --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index a7f220eb2..c88e0de28 100644 --- a/README.md +++ b/README.md @@ -204,8 +204,6 @@ python3 convert-pth-to-ggml.py models/7B/ 1 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128 ``` -Currently, it's best to use Python 3.9 or Python 3.10, as `sentencepiece` has not yet published a wheel for Python 3.11. - When running the larger models, make sure you have enough disk space to store all the intermediate files. ### Memory/Disk Requirements From 8cda5c981d0bf4dcb7664194b2cb9a06e2dbdd54 Mon Sep 17 00:00:00 2001 From: CRD716 Date: Thu, 13 Apr 2023 09:03:57 -0500 Subject: [PATCH 10/17] fix whitespace (#944) --- Makefile | 6 +- examples/benchmark/benchmark-q4_0-matmult.c | 106 ++++++++++---------- 2 files changed, 56 insertions(+), 56 deletions(-) diff --git a/Makefile b/Makefile index fe2f26ecb..c7ccf462d 100644 --- a/Makefile +++ b/Makefile @@ -171,15 +171,15 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o libllama.so: llama.o ggml.o $(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS) - + # # Tests # benchmark: ggml.o - $(CXX) $(CXXFLAGS) examples/benchmark/benchmark-q4_0-matmult.c ggml.o -o benchmark-q4_0-matmult $(LDFLAGS) + $(CXX) $(CXXFLAGS) examples/benchmark/benchmark-q4_0-matmult.c ggml.o -o benchmark-q4_0-matmult $(LDFLAGS) ./benchmark-q4_0-matmult - + .PHONY: tests tests: bash ./tests/run-tests.sh diff --git a/examples/benchmark/benchmark-q4_0-matmult.c b/examples/benchmark/benchmark-q4_0-matmult.c index 9ca9b133a..90f537fd8 100644 --- a/examples/benchmark/benchmark-q4_0-matmult.c +++ b/examples/benchmark/benchmark-q4_0-matmult.c @@ -24,12 +24,12 @@ float tensor_sum_elements(struct ggml_tensor * tensor) { float sum = 0; - if (tensor->type==6) { - for (int j = 0; j < tensor->ne[1]; j++) { - for (int k = 0; k < tensor->ne[0]; k++) { - sum += ((float *) tensor->data)[j*tensor->ne[0]+k]; - } - } + if (tensor->type==6) { + for (int j = 0; j < tensor->ne[1]; j++) { + for (int k = 0; k < tensor->ne[0]; k++) { + sum += ((float *) tensor->data)[j*tensor->ne[0]+k]; + } + } } return sum; } @@ -39,7 +39,7 @@ float tensor_sum_elements(struct ggml_tensor * tensor) { These are mapping to unknown GGML_TYPE_I8, GGML_TYPE_I16, - GGML_TYPE_I32, + GGML_TYPE_I32, GGML_TYPE_COUNT, */ @@ -50,7 +50,7 @@ float tensor_sum_elements(struct ggml_tensor * tensor) { TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \ { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); } -struct benchmark_params_struct { +struct benchmark_params_struct { int32_t n_threads = 1; int32_t n_iterations = 10; }; @@ -67,7 +67,7 @@ void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct para int main(int argc, char ** argv) { - + struct benchmark_params_struct benchmark_params; bool invalid_param = false; @@ -90,7 +90,7 @@ int main(int argc, char ** argv) { } else if (arg == "-h" || arg == "--help") { print_usage(argc, argv, benchmark_params); exit(0); - } + } if (invalid_param) { fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); print_usage(argc, argv, benchmark_params); @@ -101,9 +101,9 @@ int main(int argc, char ** argv) { // create the ggml context printf("Starting Test\n"); - - + + struct ggml_context * ctx; //const int sizex = 4096; //const int sizey = 11008; @@ -111,31 +111,31 @@ int main(int argc, char ** argv) { #undef VERBOSE_DEBUGGING #ifndef VERBOSE_DEBUGGING const int sizey = 4096; - const int sizex = 11008; + const int sizex = 11008; const int sizez = 128; #else /* Working - let's increase size */ const int sizey = 1; - const int sizex = (8*32); + const int sizex = (8*32); const int sizez = 1; /*const int sizey = 1; - const int sizex = 3*(8*32); + const int sizex = 3*(8*32); const int sizez = 1;*/ #endif //printf("Memsize required = %i\n", sizex*sizex); - ggml_type wtype = GGML_TYPE_F32; - + ggml_type wtype = GGML_TYPE_F32; + size_t ctx_size = 0; ctx_size += sizex*sizey*ggml_type_sizef(wtype); ctx_size += sizex*sizey*ggml_type_sizef(wtype); ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); ctx_size += sizex*sizeof(float); - ctx_size += 1024*1024*100; - + ctx_size += 1024*1024*100; + printf("Allocating Memory of size %li byes, %li MB\n",ctx_size, (ctx_size/1024/1024)); - + struct ggml_init_params params = { /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, @@ -147,88 +147,88 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: ggml_init() failed\n", __func__); return false; } - - + + printf("Creating new tensors\n"); // printf("Creating new tensor m1\n"); struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); ggml_set_f32(m11, 1.0f); - + // printf("Creating new tensor m1\n"); struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); ggml_set_f32(m12, 1.5f); - + // printf("Creating new tensor m2\n"); struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez); ggml_set_f32(m2, 2.0f); - + printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n"); // printf("Creating new tensor m11xm2\n"); struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2); - + // printf("Creating compute graph\n"); struct ggml_cgraph gf = ggml_build_forward(m11xm2); - + gf.n_threads=benchmark_params.n_threads; - printf("cgraph->n_threads=%i\n",gf.n_threads); - + printf("cgraph->n_threads=%i\n",gf.n_threads); + TENSOR_DUMP(m11); TENSOR_DUMP(m2); - + ggml_graph_compute(ctx, &gf); TENSOR_DUMP(gf.nodes[0]); - + printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n"); - + int32_t nelements = sizex*sizey; int32_t ne[2] = { sizex, sizey }; - - std::vector hist_cur(1 << 4, 0); + + std::vector hist_cur(1 << 4, 0); // Set up a the benchmark matrices // printf("Creating new tensor q11 & Running quantize\n"); struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey); ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data()); - + // Set up a the compute graph // printf("Creating new tensor q31\n"); struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2); - + // printf("Creating compute graph\n"); struct ggml_cgraph gf31 = ggml_build_forward(q31); gf31.n_threads=benchmark_params.n_threads; - - // Set up a second graph computation to make sure we override the CPU cache lines + + // Set up a second graph computation to make sure we override the CPU cache lines // printf("Creating new tensor q12 & Running quantize\n"); struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey); ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data()); // printf("Creating new tensor q32\n"); struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2); - + //printf("Creating compute graph\n"); struct ggml_cgraph gf32 = ggml_build_forward(q32); gf32.n_threads=benchmark_params.n_threads; - printf("cgraph->n_threads=%i\n",gf31.n_threads); - + printf("cgraph->n_threads=%i\n",gf31.n_threads); + const int dimx = sizex; const int dimy = sizey; const int dimz = sizez; long long int flops_per_dot_product = dimy + dimy; long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ; printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - aboout %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000); - + // Let's use the F32 result from above as a reference for the q4_0 multiplication float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]); - + printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n"); printf("==============================================================================================\n"); - + for (int i=0;i allowed_delta) { printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n", - sum_of_F32_reference, + sum_of_F32_reference, sum_of_Q4_result, delta, allowed_delta ); exit(0); } - - // Running a different graph computation to make sure we override the CPU cache lines + + // Running a different graph computation to make sure we override the CPU cache lines ggml_graph_compute(ctx, &gf32); - + } - + } From 6c248707f51c8a50f7792e7f7787ec481881db88 Mon Sep 17 00:00:00 2001 From: Pavol Rusnak Date: Thu, 13 Apr 2023 16:08:32 +0200 Subject: [PATCH 11/17] ggml : introduce GGML_ALIGNED_MALLOC/GGML_ALIGNED_FREE macros (#884) which allows us to use aligned_alloc or _aligned_malloc functions --- ggml.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index 546da30d1..281fd8ec9 100644 --- a/ggml.c +++ b/ggml.c @@ -114,6 +114,14 @@ typedef void* thread_ret_t; #define GGML_MEM_ALIGN 16 #endif +#if defined(_MSC_VER) || defined(__MINGW32__) +#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN) +#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) +#else +#define GGML_ALIGNED_MALLOC(size) aligned_alloc(GGML_MEM_ALIGN, size) +#define GGML_ALIGNED_FREE(ptr) free(ptr) +#endif + #define UNUSED(x) (void)(x) #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) @@ -2966,7 +2974,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { *ctx = (struct ggml_context) { /*.mem_size =*/ params.mem_size, - /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size), + /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(params.mem_size), /*.mem_buffer_owned =*/ params.mem_buffer ? false : true, /*.no_alloc =*/ params.no_alloc, /*.n_objects =*/ 0, @@ -3001,7 +3009,7 @@ void ggml_free(struct ggml_context * ctx) { __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size); if (ctx->mem_buffer_owned) { - free(ctx->mem_buffer); + GGML_ALIGNED_FREE(ctx->mem_buffer); } found = true; From 6232f2d7fd7a22d5eeb62182b2f21fcf01359754 Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Thu, 13 Apr 2023 14:59:50 +0000 Subject: [PATCH 12/17] ggml : optimize non-SIMD Q4_0 vector dot product (#703) --- ggml.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ggml.c b/ggml.c index 281fd8ec9..eb47d8298 100644 --- a/ggml.c +++ b/ggml.c @@ -2160,18 +2160,20 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest const uint8_t * restrict p0 = x[i].qs; const uint8_t * restrict p1 = y[i].qs; + int sumi = 0; for (int j = 0; j < QK/2; j++) { const uint8_t v0 = p0[j]; const uint8_t v1 = p1[j]; - const float f0 = d0*((int8_t) (v0 & 0xf) - 8); - const float f1 = d0*((int8_t) (v0 >> 4) - 8); + const int8_t i0 = (int8_t) (v0 & 0xf) - 8; + const int8_t i1 = (int8_t) (v0 >> 4) - 8; - const float f2 = d1*((int8_t) (v1 & 0xf) - 8); - const float f3 = d1*((int8_t) (v1 >> 4) - 8); + const int8_t i2 = (int8_t) (v1 & 0xf) - 8; + const int8_t i3 = (int8_t) (v1 >> 4) - 8; - sumf += f0*f2 + f1*f3; + sumi += i0*i2 + i1*i3; } + sumf += d0 * d1 * sumi; } #endif From c85980acd04631a7c43d13676276f76ec72f5dfe Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 13 Apr 2023 18:01:22 +0300 Subject: [PATCH 13/17] gitignore : benchmark --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index d8dd34fb9..ba5cbf1ed 100644 --- a/.gitignore +++ b/.gitignore @@ -23,6 +23,7 @@ models/* /result /perplexity /embedding +/benchmark-q4_0-matmult /Pipfile arm_neon.h From 9190e8eac8bdc108c40d2d7505e9b45fa773251f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 13 Apr 2023 18:04:45 +0300 Subject: [PATCH 14/17] llama : merge llama_internal.h into llama.h Hide it behind an #ifdef --- CMakeLists.txt | 1 - Makefile | 2 +- examples/quantize-stats/quantize-stats.cpp | 3 ++- llama.cpp | 1 - llama.h | 11 +++++++++++ llama_internal.h | 12 ------------ 6 files changed, 14 insertions(+), 16 deletions(-) delete mode 100644 llama_internal.h diff --git a/CMakeLists.txt b/CMakeLists.txt index affff3ea1..d5715d92a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -253,7 +253,6 @@ endif() add_library(llama llama.cpp llama.h - llama_internal.h llama_util.h) target_include_directories(llama PUBLIC .) diff --git a/Makefile b/Makefile index c7ccf462d..7db246650 100644 --- a/Makefile +++ b/Makefile @@ -142,7 +142,7 @@ default: main quantize perplexity embedding ggml.o: ggml.c ggml.h $(CC) $(CFLAGS) -c ggml.c -o ggml.o -llama.o: llama.cpp llama.h llama_util.h llama_internal.h +llama.o: llama.cpp llama.h llama_util.h $(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o common.o: examples/common.cpp examples/common.h diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 203bfe8cc..c786fe208 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -1,6 +1,7 @@ #include "ggml.h" + +#define LLAMA_API_INTERNAL #include "llama.h" -#include "llama_internal.h" #include #include diff --git a/llama.cpp b/llama.cpp index 6d8b706b9..c72295684 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5,7 +5,6 @@ #include "llama_util.h" #include "llama.h" -#include "llama_internal.h" #include "ggml.h" diff --git a/llama.h b/llama.h index 7a258a1e1..192217593 100644 --- a/llama.h +++ b/llama.h @@ -179,4 +179,15 @@ extern "C" { } #endif +// Internal API to be implemented by llama.cpp and used by tests/benchmarks only +#ifdef LLAMA_API_INTERNAL + +#include +#include +struct ggml_tensor; + +std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx); + +#endif + #endif // LLAMA_H diff --git a/llama_internal.h b/llama_internal.h deleted file mode 100644 index 543eed996..000000000 --- a/llama_internal.h +++ /dev/null @@ -1,12 +0,0 @@ -// Internal header to be included by llama.cpp and tests/benchmarks only. - -#ifndef LLAMA_INTERNAL_H -#define LLAMA_INTERNAL_H - -#include -#include -struct ggml_tensor; - -std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx); - -#endif // LLAMA_INTERNAL_H From d990e3fffc5b0f5448e90a16c79a4f2675100af0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 13 Apr 2023 18:32:36 +0300 Subject: [PATCH 15/17] ggml : speed-up ggml_vec_dot_q4_1() ARM_NEON + 32-bit ARM support (#900) * ggml : speed-up q4_1 ARM_NEON by ~5% * ggml : implement vaddvq when missing * ggml : implement vminvq and vmaxvq when missing * ggml : implement vzip when missing * ggml : fix comment * ggml : try to use correct ifdef --- ggml.c | 170 ++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 125 insertions(+), 45 deletions(-) diff --git a/ggml.c b/ggml.c index eb47d8298..b6a24b40c 100644 --- a/ggml.c +++ b/ggml.c @@ -491,6 +491,77 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) } #endif +#if __ARM_NEON + +#if !defined(__aarch64__) + +inline static uint16_t vaddvq_u8(uint8x16_t v) { + return + (uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) + + (uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) + + (uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) + + (uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) + + (uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) + + (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) + + (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) + + (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15); +} + +inline static int32_t vaddvq_s16(int16x8_t v) { + return + (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) + + (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) + + (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) + + (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7); +} + +inline static uint32_t vaddvq_u16(uint16x8_t v) { + return + (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) + + (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) + + (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) + + (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7); +} + +inline static int32_t vaddvq_s32(int32x4_t v) { + return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); +} + +inline static float vaddvq_f32(float32x4_t v) { + return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); +} + +inline float vminvq_f32(float32x4_t v) { + return + MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), + MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); +} + +inline float vmaxvq_f32(float32x4_t v) { + return + MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), + MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); +} + +inline int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) { + return vget_low_s8(vcombine_s8(a, b)); +} + +inline int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) { + return vget_high_s8(vcombine_s8(a, b)); +} + +inline uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) { + return vget_low_u8(vcombine_u8(a, b)); +} + +inline uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { + return vget_high_u8(vcombine_u8(a, b)); +} + +#endif +#endif + // method 5 // blocks of QK elements // represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors) @@ -1218,15 +1289,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in #define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c) #define GGML_F32x4_ADD vaddq_f32 #define GGML_F32x4_MUL vmulq_f32 -#if defined(__ARM_FEATURE_QRDMX) - #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) -#else - #define GGML_F32x4_REDUCE_ONE(x) \ - (vgetq_lane_f32(x, 0) + \ - vgetq_lane_f32(x, 1) + \ - vgetq_lane_f32(x, 2) + \ - vgetq_lane_f32(x, 3)) -#endif +#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) #define GGML_F32x4_REDUCE(res, x) \ { \ for (int i = 0; i < GGML_F32_ARR/2; ++i) { \ @@ -1849,55 +1912,43 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest // 4-bit -> 8-bit const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b)); const int8x16_t v1_0l = vreinterpretq_s8_u8(vandq_u8(v1_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); const int8x16_t v1_0h = vreinterpretq_s8_u8(vshrq_n_u8(v1_0, 4)); const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8(v0_1, m4b)); const int8x16_t v1_1l = vreinterpretq_s8_u8(vandq_u8(v1_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); const int8x16_t v1_1h = vreinterpretq_s8_u8(vshrq_n_u8(v1_1, 4)); // sub 8 const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); const int8x16_t v1_0ls = vsubq_s8(v1_0l, s8b); - const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); const int8x16_t v1_0hs = vsubq_s8(v1_0h, s8b); const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); const int8x16_t v1_1ls = vsubq_s8(v1_1l, s8b); - const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b); #if defined(__ARM_FEATURE_DOTPROD) - // dot product into int16x8_t + // dot product into int32x4_t int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls); int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls); p_0 = vdotq_s32(p_0, v0_0hs, v1_0hs); p_1 = vdotq_s32(p_1, v0_1hs, v1_1hs); - // scalar -#if defined(__ARM_FEATURE_QRDMX) - sum0 += x0->d * y0->d * vaddvq_s32(p_0); - sum1 += x1->d * y1->d * vaddvq_s32(p_1); -#else - sum0 += x0->d * y0->d * (vgetq_lane_s32(p_0, 0) + vgetq_lane_s32(p_0, 1) + vgetq_lane_s32(p_0, 2) + vgetq_lane_s32(p_0, 3)); - sum1 += x1->d * y1->d * (vgetq_lane_s32(p_1, 0) + vgetq_lane_s32(p_1, 1) + vgetq_lane_s32(p_1, 2) + vgetq_lane_s32(p_1, 3)); -#endif + sum0 += x0->d*y0->d*vaddvq_s32(p_0); + sum1 += x1->d*y1->d*vaddvq_s32(p_1); #else const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls)); const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs)); const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs)); const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls)); const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs)); const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs)); @@ -1910,14 +1961,8 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest const int16x8_t p_0 = vaddq_s16(pl_0, ph_0); const int16x8_t p_1 = vaddq_s16(pl_1, ph_1); - // scalar -#if defined(__ARM_FEATURE_QRDMX) - sum0 += x0->d * y0->d * vaddvq_s16(p_0); - sum1 += x1->d * y1->d * vaddvq_s16(p_1); -#else - sum0 += x0->d * y0->d * (vgetq_lane_s16(p_0, 0) + vgetq_lane_s16(p_0, 1) + vgetq_lane_s16(p_0, 2) + vgetq_lane_s16(p_0, 3) + vgetq_lane_s16(p_0, 4) + vgetq_lane_s16(p_0, 5) + vgetq_lane_s16(p_0, 6) + vgetq_lane_s16(p_0, 7)); - sum1 += x1->d * y1->d * (vgetq_lane_s16(p_1, 0) + vgetq_lane_s16(p_1, 1) + vgetq_lane_s16(p_1, 2) + vgetq_lane_s16(p_1, 3) + vgetq_lane_s16(p_1, 4) + vgetq_lane_s16(p_1, 5) + vgetq_lane_s16(p_1, 6) + vgetq_lane_s16(p_1, 7)); -#endif + sum0 += x0->d*y0->d*vaddvq_s16(p_0); + sum1 += x1->d*y1->d*vaddvq_s16(p_1); #endif } @@ -2265,36 +2310,71 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest float sum10 = 0.0f; float sum11 = 0.0f; - for (int i = 0; i < nb; ++i) { + for (int i = 0; i < nb; i += 2) { const block_q4_1 * restrict x0 = &x[i + 0]; const block_q4_1 * restrict y0 = &y[i + 0]; + const block_q4_1 * restrict x1 = &x[i + 1]; + const block_q4_1 * restrict y1 = &y[i + 1]; const uint8x16_t m4b = vdupq_n_u8(0xf); const uint8x16_t v0_0 = vld1q_u8(x0->qs); const uint8x16_t v1_0 = vld1q_u8(y0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + const uint8x16_t v1_1 = vld1q_u8(y1->qs); - // and with 0xf + // 4-bit -> 8-bit const uint8x16_t v0_0l = vandq_u8(v0_0, m4b); const uint8x16_t v1_0l = vandq_u8(v1_0, m4b); - const uint8x16_t v0_0h = vshrq_n_u8(v0_0, 4); const uint8x16_t v1_0h = vshrq_n_u8(v1_0, 4); - // dot product into uint16x8_t - const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l)); - const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l)); - - const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h)); - const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h)); - - const uint16x8_t pl0 = vaddq_u16(pl0l, pl0h); - const uint16x8_t ph0 = vaddq_u16(ph0l, ph0h); + const uint8x16_t v0_1l = vandq_u8(v0_1, m4b); + const uint8x16_t v1_1l = vandq_u8(v1_1, m4b); + const uint8x16_t v0_1h = vshrq_n_u8(v0_1, 4); + const uint8x16_t v1_1h = vshrq_n_u8(v1_1, 4); sum00 += x0->m*y0->m; sum01 += y0->m*x0->d*(vaddvq_u8(v0_0l) + vaddvq_u8(v0_0h)); sum10 += x0->m*y0->d*(vaddvq_u8(v1_0l) + vaddvq_u8(v1_0h)); - sum11 += x0->d*y0->d*vaddvq_u16(vaddq_u16(pl0, ph0)); + + sum00 += x1->m*y1->m; + sum01 += y1->m*x1->d*(vaddvq_u8(v0_1l) + vaddvq_u8(v0_1h)); + sum10 += x1->m*y1->d*(vaddvq_u8(v1_1l) + vaddvq_u8(v1_1h)); + +#if defined(__ARM_FEATURE_DOTPROD) + // dot product into int32x4_t + int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l); + int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l); + + p_0 = vdotq_s32(p_0, v0_0h, v1_0h); + p_1 = vdotq_s32(p_1, v0_1h, v1_1h); + + sum11 += x0->d*y0->d*vaddvq_s32(p_0); + sum11 += x1->d*y1->d*vaddvq_s32(p_1); +#else + const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l)); + const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l)); + const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h)); + const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h)); + + const uint16x8_t pl1l = vmull_u8(vget_low_u8 (v0_1l), vget_low_u8 (v1_1l)); + const uint16x8_t pl1h = vmull_u8(vget_high_u8(v0_1l), vget_high_u8(v1_1l)); + const uint16x8_t ph1l = vmull_u8(vget_low_u8 (v0_1h), vget_low_u8 (v1_1h)); + const uint16x8_t ph1h = vmull_u8(vget_high_u8(v0_1h), vget_high_u8(v1_1h)); + + const uint16x8_t pl_0 = vaddq_u16(pl0l, pl0h); + const uint16x8_t ph_0 = vaddq_u16(ph0l, ph0h); + + const uint16x8_t pl_1 = vaddq_u16(pl1l, pl1h); + const uint16x8_t ph_1 = vaddq_u16(ph1l, ph1h); + + const uint16x8_t p_0 = vaddq_u16(pl_0, ph_0); + const uint16x8_t p_1 = vaddq_u16(pl_1, ph_1); + + sum11 += x0->d*y0->d*vaddvq_u16(p_0); + sum11 += x1->d*y1->d*vaddvq_u16(p_1); +#endif } sumf = QK*sum00 + sum01 + sum10 + sum11; From a3a2a0eda8828b60436e9f69d9ac2c1060d03e7a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 13 Apr 2023 18:36:40 +0300 Subject: [PATCH 16/17] ggml : add GGML_DEFAULT_N_THREADS --- ggml.c | 6 +++--- ggml.h | 11 ++++++----- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/ggml.c b/ggml.c index b6a24b40c..42e3ee314 100644 --- a/ggml.c +++ b/ggml.c @@ -9363,7 +9363,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { struct ggml_cgraph result = { /*.n_nodes =*/ 0, /*.n_leafs =*/ 0, - /*.n_threads =*/ 0, + /*.n_threads =*/ GGML_DEFAULT_N_THREADS, /*.work_size =*/ 0, /*.work =*/ NULL, /*.nodes =*/ { NULL }, @@ -9983,8 +9983,8 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { GGML_PRINT("=== GRAPH ===\n"); - GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads); - GGML_PRINT_DEBUG("total work size = %zu bytes\n",cgraph->work_size); + GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads); + GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size); GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; i++) { diff --git a/ggml.h b/ggml.h index 7d8b7a182..c06c09e06 100644 --- a/ggml.h +++ b/ggml.h @@ -177,11 +177,12 @@ extern "C" { #include #include -#define GGML_MAX_DIMS 4 -#define GGML_MAX_NODES 4096 -#define GGML_MAX_PARAMS 16 -#define GGML_MAX_CONTEXTS 64 -#define GGML_MAX_OPT 4 +#define GGML_MAX_DIMS 4 +#define GGML_MAX_NODES 4096 +#define GGML_MAX_PARAMS 16 +#define GGML_MAX_CONTEXTS 64 +#define GGML_MAX_OPT 4 +#define GGML_DEFAULT_N_THREADS 4 #ifdef __ARM_NEON // we use the built-in 16-bit float type From 0e07e6a8399fd993739a3ba3c6f95f92bfab6f58 Mon Sep 17 00:00:00 2001 From: CRD716 Date: Thu, 13 Apr 2023 10:39:25 -0500 Subject: [PATCH 17/17] common : remove unnecessary includes (#947) --- examples/common.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp index 91d96efae..0772dbfe1 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -7,12 +7,6 @@ #include #include -#if defined(_MSC_VER) || defined(__MINGW32__) -#include // using malloc.h with MSC/MINGW -#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) -#include -#endif - #if defined (_WIN32) #include #include