diff --git a/.editorconfig b/.editorconfig index df8aaf504..135a7e4bc 100644 --- a/.editorconfig +++ b/.editorconfig @@ -14,3 +14,6 @@ indent_size = 4 [Makefile] indent_style = tab + +[prompts/*.txt] +insert_final_newline = unset diff --git a/.gitignore b/.gitignore index d8dd34fb9..ba5cbf1ed 100644 --- a/.gitignore +++ b/.gitignore @@ -23,6 +23,7 @@ models/* /result /perplexity /embedding +/benchmark-q4_0-matmult /Pipfile arm_neon.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 6bec1f97b..d5715d92a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,6 +56,10 @@ option(LLAMA_AVX "llama: enable AVX" option(LLAMA_AVX2 "llama: enable AVX2" ON) option(LLAMA_AVX512 "llama: enable AVX512" OFF) option(LLAMA_FMA "llama: enable FMA" ON) +# in MSVC F16C is implied with AVX2/AVX512 +if (NOT MSVC) + option(LLAMA_F16C "llama: enable F16C" ON) +endif() # 3rd party libs option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) @@ -207,7 +211,9 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$") add_compile_options(/arch:AVX) endif() else() - add_compile_options(-mf16c) + if (LLAMA_F16C) + add_compile_options(-mf16c) + endif() if (LLAMA_FMA) add_compile_options(-mfma) endif() @@ -247,7 +253,6 @@ endif() add_library(llama llama.cpp llama.h - llama_internal.h llama_util.h) target_include_directories(llama PUBLIC .) diff --git a/Makefile b/Makefile index 3e58a28a7..7db246650 100644 --- a/Makefile +++ b/Makefile @@ -142,14 +142,14 @@ default: main quantize perplexity embedding ggml.o: ggml.c ggml.h $(CC) $(CFLAGS) -c ggml.c -o ggml.o -llama.o: llama.cpp llama.h llama_util.h llama_internal.h +llama.o: llama.cpp llama.h llama_util.h $(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o common.o: examples/common.cpp examples/common.h $(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o clean: - rm -vf *.o main quantize quantize-stats perplexity embedding + rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult main: examples/main/main.cpp ggml.o llama.o common.o $(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS) @@ -171,10 +171,15 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o libllama.so: llama.o ggml.o $(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS) + # # Tests # +benchmark: ggml.o + $(CXX) $(CXXFLAGS) examples/benchmark/benchmark-q4_0-matmult.c ggml.o -o benchmark-q4_0-matmult $(LDFLAGS) + ./benchmark-q4_0-matmult + .PHONY: tests tests: bash ./tests/run-tests.sh diff --git a/README.md b/README.md index dbc088532..c88e0de28 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,7 @@ New features will probably be added mostly through community contributions. - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) +- Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node) **UI:** @@ -149,21 +150,43 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8 ## Usage -Here are the step for the LLaMA-7B model: +Here are the step for the LLaMA-7B model. + +### Get the Code ```bash -# build this repo git clone https://github.com/ggerganov/llama.cpp cd llama.cpp -make +``` -#For Windows and CMake, use the following command instead: -cd -mkdir build -cd build -cmake .. -cmake --build . --config Release +### Build +Note: For Windows, CMake or Zig can be used. + +1. Use `make` + + ```bash + make + ``` + +1. Use CMake + + ```bash + mkdir build + cd build + cmake .. + cmake --build . --config Release + ``` + +1. Use Zig + + ```bash + zig build -Drelease-fast + ``` + +### Prepare Data & Run + +```bash # obtain the original LLaMA model weights and place them in ./models ls ./models 65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model @@ -181,8 +204,6 @@ python3 convert-pth-to-ggml.py models/7B/ 1 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128 ``` -Currently, it's best to use Python 3.9 or Python 3.10, as `sentencepiece` has not yet published a wheel for Python 3.11. - When running the larger models, make sure you have enough disk space to store all the intermediate files. ### Memory/Disk Requirements diff --git a/build.zig b/build.zig index defc2c3ad..306127ffe 100644 --- a/build.zig +++ b/build.zig @@ -1,16 +1,14 @@ const std = @import("std"); -pub fn build(b: *std.Build) void { +pub fn build(b: *std.build.Builder) void { const target = b.standardTargetOptions(.{}); - const optimize = b.standardOptimizeOption(.{}); + const optimize = b.standardReleaseOptions(); const want_lto = b.option(bool, "lto", "Want -fLTO"); - const lib = b.addStaticLibrary(.{ - .name = "llama", - .target = target, - .optimize = optimize, - }); + const lib = b.addStaticLibrary("llama", null); lib.want_lto = want_lto; + lib.setTarget(target); + lib.setBuildMode(optimize); lib.linkLibCpp(); lib.addIncludePath("."); lib.addIncludePath("examples"); @@ -44,16 +42,12 @@ pub fn build(b: *std.Build) void { fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep { const b = args.b; const lib = args.lib; - const target = args.target; - const optimize = args.optimize; const want_lto = args.want_lto; - const exe = b.addExecutable(.{ - .name = name, - .target = target, - .optimize = optimize, - }); + const exe = b.addExecutable(name, null); exe.want_lto = want_lto; + lib.setTarget(args.target); + lib.setBuildMode(args.optimize); exe.addIncludePath("."); exe.addIncludePath("examples"); exe.addCSourceFiles(&.{ diff --git a/examples/alpaca.sh b/examples/alpaca.sh index 4c9aa5077..8d6261730 100755 --- a/examples/alpaca.sh +++ b/examples/alpaca.sh @@ -7,4 +7,4 @@ cd `dirname $0` cd .. -./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 +./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt --ctx_size 2048 -n -1 -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 diff --git a/examples/benchmark/benchmark-q4_0-matmult.c b/examples/benchmark/benchmark-q4_0-matmult.c new file mode 100644 index 000000000..90f537fd8 --- /dev/null +++ b/examples/benchmark/benchmark-q4_0-matmult.c @@ -0,0 +1,270 @@ +/* + License: MIT License + + Changelog: + - 2023-03-31 Initial version by Sebastian Apel (https://github.com/SebastianApel) + +*/ + +#include +#include "ggml.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +float tensor_sum_elements(struct ggml_tensor * tensor) { + float sum = 0; + if (tensor->type==6) { + for (int j = 0; j < tensor->ne[1]; j++) { + for (int k = 0; k < tensor->ne[0]; k++) { + sum += ((float *) tensor->data)[j*tensor->ne[0]+k]; + } + } + } + return sum; +} + + +/* + These are mapping to unknown + GGML_TYPE_I8, + GGML_TYPE_I16, + GGML_TYPE_I32, + GGML_TYPE_COUNT, +*/ + +#define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN" + +#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \ + TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\ + TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \ + { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); } + +struct benchmark_params_struct { + int32_t n_threads = 1; + int32_t n_iterations = 10; +}; + +void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) { + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + fprintf(stderr, " -i N, --iter N number of iterations to use during computation (default: %d)\n", params.n_iterations); + fprintf(stderr, "\n"); +} + +int main(int argc, char ** argv) { + + + struct benchmark_params_struct benchmark_params; + + bool invalid_param = false; + std::string arg; + for (int i = 1; i < argc; i++) { + arg = argv[i]; + + if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + benchmark_params.n_threads = std::stoi(argv[i]); + } else if (arg == "-i" || arg == "--iter") { + if (++i >= argc) { + invalid_param = true; + break; + } + benchmark_params.n_iterations = std::stoi(argv[i]); + } else if (arg == "-h" || arg == "--help") { + print_usage(argc, argv, benchmark_params); + exit(0); + } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + print_usage(argc, argv, benchmark_params); + exit(1); + } + } + + + // create the ggml context + printf("Starting Test\n"); + + + + struct ggml_context * ctx; + //const int sizex = 4096; + //const int sizey = 11008; + +#undef VERBOSE_DEBUGGING +#ifndef VERBOSE_DEBUGGING + const int sizey = 4096; + const int sizex = 11008; + const int sizez = 128; +#else + /* Working - let's increase size */ + const int sizey = 1; + const int sizex = (8*32); + const int sizez = 1; + + /*const int sizey = 1; + const int sizex = 3*(8*32); + const int sizez = 1;*/ +#endif + + //printf("Memsize required = %i\n", sizex*sizex); + ggml_type wtype = GGML_TYPE_F32; + + size_t ctx_size = 0; + ctx_size += sizex*sizey*ggml_type_sizef(wtype); + ctx_size += sizex*sizey*ggml_type_sizef(wtype); + ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); + ctx_size += sizex*sizeof(float); + ctx_size += 1024*1024*100; + + printf("Allocating Memory of size %li byes, %li MB\n",ctx_size, (ctx_size/1024/1024)); + + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /* no_alloc =*/ 0 + }; + + ctx = ggml_init(params); + if (!ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + + + printf("Creating new tensors\n"); + // printf("Creating new tensor m1\n"); + struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + ggml_set_f32(m11, 1.0f); + + // printf("Creating new tensor m1\n"); + struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + ggml_set_f32(m12, 1.5f); + + // printf("Creating new tensor m2\n"); + struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez); + ggml_set_f32(m2, 2.0f); + + printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n"); + // printf("Creating new tensor m11xm2\n"); + struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2); + + // printf("Creating compute graph\n"); + struct ggml_cgraph gf = ggml_build_forward(m11xm2); + + gf.n_threads=benchmark_params.n_threads; + printf("cgraph->n_threads=%i\n",gf.n_threads); + + TENSOR_DUMP(m11); + TENSOR_DUMP(m2); + + ggml_graph_compute(ctx, &gf); + + TENSOR_DUMP(gf.nodes[0]); + + printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n"); + + int32_t nelements = sizex*sizey; + int32_t ne[2] = { sizex, sizey }; + + std::vector hist_cur(1 << 4, 0); + + // Set up a the benchmark matrices + // printf("Creating new tensor q11 & Running quantize\n"); + struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey); + ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data()); + + // Set up a the compute graph + // printf("Creating new tensor q31\n"); + struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2); + + // printf("Creating compute graph\n"); + struct ggml_cgraph gf31 = ggml_build_forward(q31); + gf31.n_threads=benchmark_params.n_threads; + + // Set up a second graph computation to make sure we override the CPU cache lines + // printf("Creating new tensor q12 & Running quantize\n"); + struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey); + ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data()); + + // printf("Creating new tensor q32\n"); + struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2); + + //printf("Creating compute graph\n"); + struct ggml_cgraph gf32 = ggml_build_forward(q32); + gf32.n_threads=benchmark_params.n_threads; + printf("cgraph->n_threads=%i\n",gf31.n_threads); + + const int dimx = sizex; + const int dimy = sizey; + const int dimz = sizez; + long long int flops_per_dot_product = dimy + dimy; + long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ; + printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - aboout %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000); + + + // Let's use the F32 result from above as a reference for the q4_0 multiplication + float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]); + + + printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n"); + printf("==============================================================================================\n"); + + for (int i=0;i allowed_delta) { + printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n", + sum_of_F32_reference, + sum_of_Q4_result, + delta, + allowed_delta + ); + exit(0); + } + + // Running a different graph computation to make sure we override the CPU cache lines + ggml_graph_compute(ctx, &gf32); + + } + +} diff --git a/examples/common.cpp b/examples/common.cpp index c0e9ab1f1..7ce4d1fcd 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -7,12 +7,6 @@ #include #include -#if defined(_MSC_VER) || defined(__MINGW32__) -#include // using malloc.h with MSC/MINGW -#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) -#include -#endif - #if defined (_WIN32) #include #include diff --git a/examples/gpt4all.sh b/examples/gpt4all.sh index d974f95a9..5fd739e55 100755 --- a/examples/gpt4all.sh +++ b/examples/gpt4all.sh @@ -10,6 +10,6 @@ cd .. ./main --color --instruct --threads 4 \ --model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \ --file ./prompts/alpaca.txt \ - --batch_size 8 --ctx_size 2048 \ + --batch_size 8 --ctx_size 2048 -n -1 \ --repeat_last_n 64 --repeat_penalty 1.3 \ --n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95 diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 203bfe8cc..c786fe208 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -1,6 +1,7 @@ #include "ggml.h" + +#define LLAMA_API_INTERNAL #include "llama.h" -#include "llama_internal.h" #include #include diff --git a/flake.nix b/flake.nix index cd1b6d28e..91d2edd79 100644 --- a/flake.nix +++ b/flake.nix @@ -28,10 +28,8 @@ ]; installPhase = '' mkdir -p $out/bin - mv bin/main $out/bin/llama - mv bin/quantize $out/bin/quantize - mv bin/embedding $out/bin/embedding - mv bin/perplexity $out/bin/perplexity + mv bin/* $out/bin/ + mv $out/bin/main $out/bin/llama echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml diff --git a/ggml.c b/ggml.c index a26b4853f..42e3ee314 100644 --- a/ggml.c +++ b/ggml.c @@ -114,6 +114,14 @@ typedef void* thread_ret_t; #define GGML_MEM_ALIGN 16 #endif +#if defined(_MSC_VER) || defined(__MINGW32__) +#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN) +#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) +#else +#define GGML_ALIGNED_MALLOC(size) aligned_alloc(GGML_MEM_ALIGN, size) +#define GGML_ALIGNED_FREE(ptr) free(ptr) +#endif + #define UNUSED(x) (void)(x) #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) @@ -483,6 +491,77 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) } #endif +#if __ARM_NEON + +#if !defined(__aarch64__) + +inline static uint16_t vaddvq_u8(uint8x16_t v) { + return + (uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) + + (uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) + + (uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) + + (uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) + + (uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) + + (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) + + (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) + + (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15); +} + +inline static int32_t vaddvq_s16(int16x8_t v) { + return + (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) + + (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) + + (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) + + (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7); +} + +inline static uint32_t vaddvq_u16(uint16x8_t v) { + return + (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) + + (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) + + (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) + + (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7); +} + +inline static int32_t vaddvq_s32(int32x4_t v) { + return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); +} + +inline static float vaddvq_f32(float32x4_t v) { + return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); +} + +inline float vminvq_f32(float32x4_t v) { + return + MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), + MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); +} + +inline float vmaxvq_f32(float32x4_t v) { + return + MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), + MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); +} + +inline int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) { + return vget_low_s8(vcombine_s8(a, b)); +} + +inline int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) { + return vget_high_s8(vcombine_s8(a, b)); +} + +inline uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) { + return vget_low_u8(vcombine_u8(a, b)); +} + +inline uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { + return vget_high_u8(vcombine_u8(a, b)); +} + +#endif +#endif + // method 5 // blocks of QK elements // represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors) @@ -1210,15 +1289,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in #define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c) #define GGML_F32x4_ADD vaddq_f32 #define GGML_F32x4_MUL vmulq_f32 -#if defined(__ARM_FEATURE_QRDMX) - #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) -#else - #define GGML_F32x4_REDUCE_ONE(x) \ - (vgetq_lane_f32(x, 0) + \ - vgetq_lane_f32(x, 1) + \ - vgetq_lane_f32(x, 2) + \ - vgetq_lane_f32(x, 3)) -#endif +#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) #define GGML_F32x4_REDUCE(res, x) \ { \ for (int i = 0; i < GGML_F32_ARR/2; ++i) { \ @@ -1841,55 +1912,43 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest // 4-bit -> 8-bit const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b)); const int8x16_t v1_0l = vreinterpretq_s8_u8(vandq_u8(v1_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); const int8x16_t v1_0h = vreinterpretq_s8_u8(vshrq_n_u8(v1_0, 4)); const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8(v0_1, m4b)); const int8x16_t v1_1l = vreinterpretq_s8_u8(vandq_u8(v1_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); const int8x16_t v1_1h = vreinterpretq_s8_u8(vshrq_n_u8(v1_1, 4)); // sub 8 const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); const int8x16_t v1_0ls = vsubq_s8(v1_0l, s8b); - const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); const int8x16_t v1_0hs = vsubq_s8(v1_0h, s8b); const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); const int8x16_t v1_1ls = vsubq_s8(v1_1l, s8b); - const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b); #if defined(__ARM_FEATURE_DOTPROD) - // dot product into int16x8_t + // dot product into int32x4_t int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls); int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls); p_0 = vdotq_s32(p_0, v0_0hs, v1_0hs); p_1 = vdotq_s32(p_1, v0_1hs, v1_1hs); - // scalar -#if defined(__ARM_FEATURE_QRDMX) - sum0 += x0->d * y0->d * vaddvq_s32(p_0); - sum1 += x1->d * y1->d * vaddvq_s32(p_1); -#else - sum0 += x0->d * y0->d * (vgetq_lane_s32(p_0, 0) + vgetq_lane_s32(p_0, 1) + vgetq_lane_s32(p_0, 2) + vgetq_lane_s32(p_0, 3)); - sum1 += x1->d * y1->d * (vgetq_lane_s32(p_1, 0) + vgetq_lane_s32(p_1, 1) + vgetq_lane_s32(p_1, 2) + vgetq_lane_s32(p_1, 3)); -#endif + sum0 += x0->d*y0->d*vaddvq_s32(p_0); + sum1 += x1->d*y1->d*vaddvq_s32(p_1); #else const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls)); const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs)); const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs)); const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls)); const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs)); const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs)); @@ -1902,14 +1961,8 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest const int16x8_t p_0 = vaddq_s16(pl_0, ph_0); const int16x8_t p_1 = vaddq_s16(pl_1, ph_1); - // scalar -#if defined(__ARM_FEATURE_QRDMX) - sum0 += x0->d * y0->d * vaddvq_s16(p_0); - sum1 += x1->d * y1->d * vaddvq_s16(p_1); -#else - sum0 += x0->d * y0->d * (vgetq_lane_s16(p_0, 0) + vgetq_lane_s16(p_0, 1) + vgetq_lane_s16(p_0, 2) + vgetq_lane_s16(p_0, 3) + vgetq_lane_s16(p_0, 4) + vgetq_lane_s16(p_0, 5) + vgetq_lane_s16(p_0, 6) + vgetq_lane_s16(p_0, 7)); - sum1 += x1->d * y1->d * (vgetq_lane_s16(p_1, 0) + vgetq_lane_s16(p_1, 1) + vgetq_lane_s16(p_1, 2) + vgetq_lane_s16(p_1, 3) + vgetq_lane_s16(p_1, 4) + vgetq_lane_s16(p_1, 5) + vgetq_lane_s16(p_1, 6) + vgetq_lane_s16(p_1, 7)); -#endif + sum0 += x0->d*y0->d*vaddvq_s16(p_0); + sum1 += x1->d*y1->d*vaddvq_s16(p_1); #endif } @@ -2152,18 +2205,20 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest const uint8_t * restrict p0 = x[i].qs; const uint8_t * restrict p1 = y[i].qs; + int sumi = 0; for (int j = 0; j < QK/2; j++) { const uint8_t v0 = p0[j]; const uint8_t v1 = p1[j]; - const float f0 = d0*((int8_t) (v0 & 0xf) - 8); - const float f1 = d0*((int8_t) (v0 >> 4) - 8); + const int8_t i0 = (int8_t) (v0 & 0xf) - 8; + const int8_t i1 = (int8_t) (v0 >> 4) - 8; - const float f2 = d1*((int8_t) (v1 & 0xf) - 8); - const float f3 = d1*((int8_t) (v1 >> 4) - 8); + const int8_t i2 = (int8_t) (v1 & 0xf) - 8; + const int8_t i3 = (int8_t) (v1 >> 4) - 8; - sumf += f0*f2 + f1*f3; + sumi += i0*i2 + i1*i3; } + sumf += d0 * d1 * sumi; } #endif @@ -2255,36 +2310,71 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest float sum10 = 0.0f; float sum11 = 0.0f; - for (int i = 0; i < nb; ++i) { + for (int i = 0; i < nb; i += 2) { const block_q4_1 * restrict x0 = &x[i + 0]; const block_q4_1 * restrict y0 = &y[i + 0]; + const block_q4_1 * restrict x1 = &x[i + 1]; + const block_q4_1 * restrict y1 = &y[i + 1]; const uint8x16_t m4b = vdupq_n_u8(0xf); const uint8x16_t v0_0 = vld1q_u8(x0->qs); const uint8x16_t v1_0 = vld1q_u8(y0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + const uint8x16_t v1_1 = vld1q_u8(y1->qs); - // and with 0xf + // 4-bit -> 8-bit const uint8x16_t v0_0l = vandq_u8(v0_0, m4b); const uint8x16_t v1_0l = vandq_u8(v1_0, m4b); - const uint8x16_t v0_0h = vshrq_n_u8(v0_0, 4); const uint8x16_t v1_0h = vshrq_n_u8(v1_0, 4); - // dot product into uint16x8_t - const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l)); - const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l)); - - const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h)); - const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h)); - - const uint16x8_t pl0 = vaddq_u16(pl0l, pl0h); - const uint16x8_t ph0 = vaddq_u16(ph0l, ph0h); + const uint8x16_t v0_1l = vandq_u8(v0_1, m4b); + const uint8x16_t v1_1l = vandq_u8(v1_1, m4b); + const uint8x16_t v0_1h = vshrq_n_u8(v0_1, 4); + const uint8x16_t v1_1h = vshrq_n_u8(v1_1, 4); sum00 += x0->m*y0->m; sum01 += y0->m*x0->d*(vaddvq_u8(v0_0l) + vaddvq_u8(v0_0h)); sum10 += x0->m*y0->d*(vaddvq_u8(v1_0l) + vaddvq_u8(v1_0h)); - sum11 += x0->d*y0->d*vaddvq_u16(vaddq_u16(pl0, ph0)); + + sum00 += x1->m*y1->m; + sum01 += y1->m*x1->d*(vaddvq_u8(v0_1l) + vaddvq_u8(v0_1h)); + sum10 += x1->m*y1->d*(vaddvq_u8(v1_1l) + vaddvq_u8(v1_1h)); + +#if defined(__ARM_FEATURE_DOTPROD) + // dot product into int32x4_t + int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l); + int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l); + + p_0 = vdotq_s32(p_0, v0_0h, v1_0h); + p_1 = vdotq_s32(p_1, v0_1h, v1_1h); + + sum11 += x0->d*y0->d*vaddvq_s32(p_0); + sum11 += x1->d*y1->d*vaddvq_s32(p_1); +#else + const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l)); + const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l)); + const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h)); + const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h)); + + const uint16x8_t pl1l = vmull_u8(vget_low_u8 (v0_1l), vget_low_u8 (v1_1l)); + const uint16x8_t pl1h = vmull_u8(vget_high_u8(v0_1l), vget_high_u8(v1_1l)); + const uint16x8_t ph1l = vmull_u8(vget_low_u8 (v0_1h), vget_low_u8 (v1_1h)); + const uint16x8_t ph1h = vmull_u8(vget_high_u8(v0_1h), vget_high_u8(v1_1h)); + + const uint16x8_t pl_0 = vaddq_u16(pl0l, pl0h); + const uint16x8_t ph_0 = vaddq_u16(ph0l, ph0h); + + const uint16x8_t pl_1 = vaddq_u16(pl1l, pl1h); + const uint16x8_t ph_1 = vaddq_u16(ph1l, ph1h); + + const uint16x8_t p_0 = vaddq_u16(pl_0, ph_0); + const uint16x8_t p_1 = vaddq_u16(pl_1, ph_1); + + sum11 += x0->d*y0->d*vaddvq_u16(p_0); + sum11 += x1->d*y1->d*vaddvq_u16(p_1); +#endif } sumf = QK*sum00 + sum01 + sum10 + sum11; @@ -2966,7 +3056,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { *ctx = (struct ggml_context) { /*.mem_size =*/ params.mem_size, - /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size), + /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(params.mem_size), /*.mem_buffer_owned =*/ params.mem_buffer ? false : true, /*.no_alloc =*/ params.no_alloc, /*.n_objects =*/ 0, @@ -3001,7 +3091,7 @@ void ggml_free(struct ggml_context * ctx) { __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size); if (ctx->mem_buffer_owned) { - free(ctx->mem_buffer); + GGML_ALIGNED_FREE(ctx->mem_buffer); } found = true; @@ -6435,7 +6525,7 @@ static void ggml_compute_forward_mul_mat_f32( cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, - x, ne10, + x, ne00, 0.0f, d, ne01); } } @@ -6607,7 +6697,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, - x, ne10, + x, ne00, 0.0f, d, ne01); } } @@ -6820,7 +6910,7 @@ static void ggml_compute_forward_mul_mat_q_f32( cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, - x, ne10, + x, ne00, 0.0f, d, ne01); } } @@ -9273,7 +9363,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { struct ggml_cgraph result = { /*.n_nodes =*/ 0, /*.n_leafs =*/ 0, - /*.n_threads =*/ 0, + /*.n_threads =*/ GGML_DEFAULT_N_THREADS, /*.work_size =*/ 0, /*.work =*/ NULL, /*.nodes =*/ { NULL }, @@ -9893,8 +9983,8 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { GGML_PRINT("=== GRAPH ===\n"); - GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads); - GGML_PRINT_DEBUG("total work size = %zu bytes\n",cgraph->work_size); + GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads); + GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size); GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; i++) { diff --git a/ggml.h b/ggml.h index 7d8b7a182..c06c09e06 100644 --- a/ggml.h +++ b/ggml.h @@ -177,11 +177,12 @@ extern "C" { #include #include -#define GGML_MAX_DIMS 4 -#define GGML_MAX_NODES 4096 -#define GGML_MAX_PARAMS 16 -#define GGML_MAX_CONTEXTS 64 -#define GGML_MAX_OPT 4 +#define GGML_MAX_DIMS 4 +#define GGML_MAX_NODES 4096 +#define GGML_MAX_PARAMS 16 +#define GGML_MAX_CONTEXTS 64 +#define GGML_MAX_OPT 4 +#define GGML_DEFAULT_N_THREADS 4 #ifdef __ARM_NEON // we use the built-in 16-bit float type diff --git a/llama.cpp b/llama.cpp index 6d8b706b9..c72295684 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5,7 +5,6 @@ #include "llama_util.h" #include "llama.h" -#include "llama_internal.h" #include "ggml.h" diff --git a/llama.h b/llama.h index 7a258a1e1..192217593 100644 --- a/llama.h +++ b/llama.h @@ -179,4 +179,15 @@ extern "C" { } #endif +// Internal API to be implemented by llama.cpp and used by tests/benchmarks only +#ifdef LLAMA_API_INTERNAL + +#include +#include +struct ggml_tensor; + +std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx); + +#endif + #endif // LLAMA_H diff --git a/llama_internal.h b/llama_internal.h deleted file mode 100644 index 543eed996..000000000 --- a/llama_internal.h +++ /dev/null @@ -1,12 +0,0 @@ -// Internal header to be included by llama.cpp and tests/benchmarks only. - -#ifndef LLAMA_INTERNAL_H -#define LLAMA_INTERNAL_H - -#include -#include -struct ggml_tensor; - -std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx); - -#endif // LLAMA_INTERNAL_H diff --git a/prompts/chat-with-bob.txt b/prompts/chat-with-bob.txt index 009da39ae..ad494d831 100644 --- a/prompts/chat-with-bob.txt +++ b/prompts/chat-with-bob.txt @@ -4,4 +4,4 @@ User: Hello, Bob. Bob: Hello. How may I help you today? User: Please tell me the largest city in Europe. Bob: Sure. The largest city in Europe is Moscow, the capital of Russia. -User: +User: \ No newline at end of file diff --git a/prompts/reason-act.txt b/prompts/reason-act.txt index 872016631..a4f4f4ee6 100644 --- a/prompts/reason-act.txt +++ b/prompts/reason-act.txt @@ -15,4 +15,4 @@ Answer: The calculate tool says it is 9.3333333333 Question: What is capital of france? Thought: Do I need to use an action? No, I know the answer Answer: Paris is the capital of France -Question: +Question: \ No newline at end of file