Merge pull request #1 from ggerganov/master

pull down new code
wbpxre150 2023-04-14 12:48:36 +08:00 committed by GitHub
commit 18cf46c781
20 changed files with 527 additions and 136 deletions

.editorconfig

@@ -14,3 +14,6 @@ indent_size = 4
[Makefile]
indent_style = tab
+
+[prompts/*.txt]
+insert_final_newline = unset

.gitignore

@@ -23,6 +23,7 @@ models/*
/result
/perplexity
/embedding
+/benchmark-q4_0-matmult
/Pipfile

arm_neon.h

CMakeLists.txt

@@ -56,6 +56,10 @@ option(LLAMA_AVX "llama: enable AVX"
option(LLAMA_AVX2 "llama: enable AVX2" ON)
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
option(LLAMA_FMA "llama: enable FMA" ON)
+# in MSVC F16C is implied with AVX2/AVX512
+if (NOT MSVC)
+    option(LLAMA_F16C "llama: enable F16C" ON)
+endif()

# 3rd party libs
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)

@@ -207,7 +211,9 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
        add_compile_options(/arch:AVX)
    endif()
else()
+    if (LLAMA_F16C)
        add_compile_options(-mf16c)
+    endif()
    if (LLAMA_FMA)
        add_compile_options(-mfma)
    endif()
@@ -247,7 +253,6 @@ endif()
add_library(llama
            llama.cpp
            llama.h
-           llama_internal.h
            llama_util.h)

target_include_directories(llama PUBLIC .)
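The new LLAMA_F16C switch only exists on non-MSVC toolchains (MSVC implies F16C with AVX2/AVX512). As a quick illustration, a configure step that turns it off might look like the sketch below; only the `-DLLAMA_F16C` flag comes from the option added above, the rest is assumed boilerplate.

```bash
# hypothetical out-of-tree configure/build with the F16C intrinsics disabled
mkdir build && cd build
cmake .. -DLLAMA_F16C=OFF
cmake --build . --config Release
```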

Makefile

@@ -142,14 +142,14 @@ default: main quantize perplexity embedding
ggml.o: ggml.c ggml.h
	$(CC)  $(CFLAGS)   -c ggml.c -o ggml.o

-llama.o: llama.cpp llama.h llama_util.h llama_internal.h
+llama.o: llama.cpp llama.h llama_util.h
	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o

common.o: examples/common.cpp examples/common.h
	$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o

clean:
-	rm -vf *.o main quantize quantize-stats perplexity embedding
+	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult

main: examples/main/main.cpp ggml.o llama.o common.o
	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)

@@ -171,10 +171,15 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
libllama.so: llama.o ggml.o
	$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)

#
# Tests
#

+benchmark: ggml.o
+	$(CXX) $(CXXFLAGS) examples/benchmark/benchmark-q4_0-matmult.c ggml.o -o benchmark-q4_0-matmult $(LDFLAGS)
+	./benchmark-q4_0-matmult
+
.PHONY: tests
tests:
	bash ./tests/run-tests.sh
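The new `benchmark` target compiles the Q4_0 matrix-multiplication benchmark against `ggml.o` and runs it once. A minimal way to exercise it from a checkout (assuming the default compiler picked up by the Makefile) is simply:

```bash
# builds examples/benchmark/benchmark-q4_0-matmult.c and runs the resulting binary
make benchmark
```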

README.md

@@ -9,6 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

**Hot topics:**

+- [Add GPU support to ggml](https://github.com/ggerganov/llama.cpp/discussions/915)
- [Roadmap Apr 2023](https://github.com/ggerganov/llama.cpp/discussions/784)

## Description

@@ -48,6 +49,7 @@ New features will probably be added mostly through community contributions.

- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
+- Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)

**UI:**

@@ -148,21 +150,43 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8

## Usage

-Here are the step for the LLaMA-7B model:
+Here are the step for the LLaMA-7B model.
+
+### Get the Code

```bash
-# build this repo
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
-make
+```

-#For Windows and CMake, use the following command instead:
-cd <path_to_llama_folder>
-mkdir build
-cd build
-cmake ..
-cmake --build . --config Release
+### Build
+
+Note: For Windows, CMake or Zig can be used.
+
+1. Use `make`
+
+```bash
+make
+```
+
+1. Use CMake
+
+```bash
+mkdir build
+cd build
+cmake ..
+cmake --build . --config Release
+```
+
+1. Use Zig
+
+```bash
+zig build -Drelease-fast
+```
+
+### Prepare Data & Run
+
+```bash
# obtain the original LLaMA model weights and place them in ./models
ls ./models
65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model

@@ -180,8 +204,6 @@ python3 convert-pth-to-ggml.py models/7B/ 1
./main -m ./models/7B/ggml-model-q4_0.bin -n 128
```

-Currently, it's best to use Python 3.9 or Python 3.10, as `sentencepiece` has not yet published a wheel for Python 3.11.
-
When running the larger models, make sure you have enough disk space to store all the intermediate files.

### Memory/Disk Requirements

build.zig

@@ -1,16 +1,14 @@
const std = @import("std");

-pub fn build(b: *std.Build) void {
+pub fn build(b: *std.build.Builder) void {
    const target = b.standardTargetOptions(.{});
-   const optimize = b.standardOptimizeOption(.{});
+   const optimize = b.standardReleaseOptions();
    const want_lto = b.option(bool, "lto", "Want -fLTO");

-   const lib = b.addStaticLibrary(.{
-       .name = "llama",
-       .target = target,
-       .optimize = optimize,
-   });
+   const lib = b.addStaticLibrary("llama", null);
    lib.want_lto = want_lto;
+   lib.setTarget(target);
+   lib.setBuildMode(optimize);
    lib.linkLibCpp();
    lib.addIncludePath(".");
    lib.addIncludePath("examples");

@@ -44,16 +42,12 @@ pub fn build(b: *std.Build) void {
fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
    const b = args.b;
    const lib = args.lib;
-   const target = args.target;
-   const optimize = args.optimize;
    const want_lto = args.want_lto;

-   const exe = b.addExecutable(.{
-       .name = name,
-       .target = target,
-       .optimize = optimize,
-   });
+   const exe = b.addExecutable(name, null);
    exe.want_lto = want_lto;
+   lib.setTarget(args.target);
+   lib.setBuildMode(args.optimize);
    exe.addIncludePath(".");
    exe.addIncludePath("examples");
    exe.addCSourceFiles(&.{

examples/alpaca.sh

@@ -7,4 +7,4 @@
cd `dirname $0`
cd ..

-./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7
+./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt --ctx_size 2048 -n -1 -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7

examples/benchmark/benchmark-q4_0-matmult.c

@@ -0,0 +1,270 @@
/*
    License: MIT License

    Changelog:
    - 2023-03-31 Initial version by Sebastian Apel (https://github.com/SebastianApel)
*/

#include <locale.h>
#include "ggml.h"
#include <assert.h>
#include <math.h>
#include <cstring>
#include <cstdio>
#include <cinttypes>
#include <unordered_map>
#include <queue>
#include <string.h>
#include <cassert>
#include <fstream>
#include <string>
#include <iterator>
#include <algorithm>

float tensor_sum_elements(struct ggml_tensor * tensor) {
    float sum = 0;
    if (tensor->type==6) {
        for (int j = 0; j < tensor->ne[1]; j++) {
            for (int k = 0; k < tensor->ne[0]; k++) {
                sum += ((float *) tensor->data)[j*tensor->ne[0]+k];
            }
        }
    }
    return sum;
}

/*
    These are mapping to unknown
    GGML_TYPE_I8,
    GGML_TYPE_I16,
    GGML_TYPE_I32,
    GGML_TYPE_COUNT,
*/

#define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN"

#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \
        TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\
        TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
    { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }

struct benchmark_params_struct {
    int32_t n_threads    = 1;
    int32_t n_iterations = 10;
};

void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h, --help            show this help message and exit\n");
    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
    fprintf(stderr, "  -i N, --iter N        number of iterations to use during computation (default: %d)\n", params.n_iterations);
    fprintf(stderr, "\n");
}

int main(int argc, char ** argv) {
    struct benchmark_params_struct benchmark_params;

    bool invalid_param = false;
    std::string arg;
    for (int i = 1; i < argc; i++) {
        arg = argv[i];

        if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            benchmark_params.n_threads = std::stoi(argv[i]);
        } else if (arg == "-i" || arg == "--iter") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            benchmark_params.n_iterations = std::stoi(argv[i]);
        } else if (arg == "-h" || arg == "--help") {
            print_usage(argc, argv, benchmark_params);
            exit(0);
        }
        if (invalid_param) {
            fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
            print_usage(argc, argv, benchmark_params);
            exit(1);
        }
    }

    // create the ggml context
    printf("Starting Test\n");

    struct ggml_context * ctx;
    //const int sizex = 4096;
    //const int sizey = 11008;

#undef VERBOSE_DEBUGGING
#ifndef VERBOSE_DEBUGGING
    const int sizey = 4096;
    const int sizex = 11008;
    const int sizez = 128;
#else
    /* Working - let's increase size */
    const int sizey = 1;
    const int sizex = (8*32);
    const int sizez = 1;

    /*const int sizey = 1;
    const int sizex = 3*(8*32);
    const int sizez = 1;*/
#endif

    //printf("Memsize required = %i\n", sizex*sizex);
    ggml_type wtype = GGML_TYPE_F32;

    size_t ctx_size = 0;
    ctx_size += sizex*sizey*ggml_type_sizef(wtype);
    ctx_size += sizex*sizey*ggml_type_sizef(wtype);
    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
    ctx_size += sizex*sizeof(float);
    ctx_size += 1024*1024*100;

    printf("Allocating Memory of size %li byes, %li MB\n",ctx_size, (ctx_size/1024/1024));

    struct ggml_init_params params = {
        /*.mem_size   =*/ ctx_size,
        /*.mem_buffer =*/ NULL,
        /* no_alloc   =*/ 0
    };

    ctx = ggml_init(params);
    if (!ctx) {
        fprintf(stderr, "%s: ggml_init() failed\n", __func__);
        return false;
    }

    printf("Creating new tensors\n");
    // printf("Creating new tensor m1\n");
    struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
    ggml_set_f32(m11, 1.0f);

    // printf("Creating new tensor m1\n");
    struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
    ggml_set_f32(m12, 1.5f);

    // printf("Creating new tensor m2\n");
    struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
    ggml_set_f32(m2, 2.0f);

    printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
    // printf("Creating new tensor m11xm2\n");
    struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);

    // printf("Creating compute graph\n");
    struct ggml_cgraph gf = ggml_build_forward(m11xm2);

    gf.n_threads=benchmark_params.n_threads;
    printf("cgraph->n_threads=%i\n",gf.n_threads);

    TENSOR_DUMP(m11);
    TENSOR_DUMP(m2);

    ggml_graph_compute(ctx, &gf);

    TENSOR_DUMP(gf.nodes[0]);

    printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");

    int32_t nelements = sizex*sizey;
    int32_t ne[2] = { sizex, sizey };

    std::vector<int64_t> hist_cur(1 << 4, 0);

    // Set up a the benchmark matrices
    // printf("Creating new tensor q11 & Running quantize\n");
    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
    ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());

    // Set up a the compute graph
    // printf("Creating new tensor q31\n");
    struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);

    // printf("Creating compute graph\n");
    struct ggml_cgraph gf31 = ggml_build_forward(q31);
    gf31.n_threads=benchmark_params.n_threads;

    // Set up a second graph computation to make sure we override the CPU cache lines
    // printf("Creating new tensor q12 & Running quantize\n");
    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
    ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());

    // printf("Creating new tensor q32\n");
    struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);

    //printf("Creating compute graph\n");
    struct ggml_cgraph gf32 = ggml_build_forward(q32);
    gf32.n_threads=benchmark_params.n_threads;
    printf("cgraph->n_threads=%i\n",gf31.n_threads);

    const int dimx = sizex;
    const int dimy = sizey;
    const int dimz = sizez;
    long long int flops_per_dot_product = dimy + dimy;
    long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ;
    printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - aboout %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);

    // Let's use the F32 result from above as a reference for the q4_0 multiplication
    float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);

    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n");
    printf("==============================================================================================\n");

    for (int i=0;i<benchmark_params.n_iterations ;i++) {

        long long int start = ggml_time_us();
        //printf("Running ggml_graph_compute\n");
        ggml_graph_compute(ctx, &gf31);
        long long int stop = ggml_time_us();
        long long int usec = stop-start;
        float sec = usec/1000000;
        float flops_per_usec = (1.0f*flops_per_matrix)/usec;
        printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n",
            i,
            gf31.n_threads,
            sizex, sizey, sizez, flops_per_matrix,
            usec,flops_per_usec);

#ifdef VERBOSE_DEBUGGING
        TENSOR_DUMP("res",gf31.nodes[0])
#endif

        // Check that the matrix multiplication result is in the right ballpark
        // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
        float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
        float delta = abs(sum_of_Q4_result - sum_of_F32_reference);
        float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6

        if (delta > allowed_delta) {
            printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n",
                sum_of_F32_reference,
                sum_of_Q4_result,
                delta,
                allowed_delta
            );
            exit(0);
        }

        // Running a different graph computation to make sure we override the CPU cache lines
        ggml_graph_compute(ctx, &gf32);
    }
}
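Once built, the binary can also be re-run directly; the `-t/--threads` and `-i/--iter` options parsed above control the run, so an illustrative invocation (flag values chosen arbitrarily) might be:

```bash
# 20 timed iterations of the Q4_0 mat-mul benchmark on 8 threads
./benchmark-q4_0-matmult --threads 8 --iter 20
```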

examples/common.cpp

@@ -7,12 +7,6 @@
#include <iterator>
#include <algorithm>

-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <malloc.h> // using malloc.h with MSC/MINGW
-#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
-#include <alloca.h>
-#endif
-
#if defined (_WIN32)
#include <fcntl.h>
#include <io.h>

examples/gpt4all.sh

@@ -10,6 +10,6 @@ cd ..
./main --color --instruct --threads 4 \
  --model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
  --file ./prompts/alpaca.txt \
-  --batch_size 8 --ctx_size 2048 \
+  --batch_size 8 --ctx_size 2048 -n -1 \
  --repeat_last_n 64 --repeat_penalty 1.3 \
  --n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95

examples/perplexity/perplexity.cpp

@@ -27,21 +27,28 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
    int count = 0;
    int seq_count = tokens.size() / params.n_ctx;
+   int n_vocab = llama_n_vocab(ctx);

    double nll = 0.0;
-   fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
+   fprintf(stderr, "%s : calculating perplexity over %d chunks, batch_size=%d\n", __func__, seq_count, params.n_batch);

    for (int i = 0; i < seq_count; ++i) {
        int start = i * params.n_ctx;
-       int end = start + params.n_ctx - 1; // TODO: this is not optimal, e.g. it makes the batch 511 instead of 512
-                                           // it is better to always be power of 2 for better performance
+       int end = start + params.n_ctx;

-       std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
+       std::vector<float> logits;
+       int num_batches = (params.n_ctx + params.n_batch - 1) / params.n_batch;

        auto start_t = std::chrono::high_resolution_clock::now();
-       if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
+       for (int j = 0; j < num_batches; ++j) {
+           int batch_start = start + j * params.n_batch;
+           int batch_size = std::min(end - batch_start, params.n_batch);
+           if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * params.n_batch, params.n_threads)) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return;
        }
+           auto batch_logits = llama_get_logits(ctx);
+           logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+       }

        auto end_t = std::chrono::high_resolution_clock::now();
        if (i == 0) {
            const float seconds = std::chrono::duration<float>(end_t - start_t).count();

@@ -59,15 +66,12 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
        // Example, we have a context window of 512, we will compute perplexity for each of the
        // last 256 tokens.  Then, we split the input up into context window size chunks to
        // process the entire prompt.
-       auto logits = llama_get_logits(ctx);
-       for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
+       for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) {
            // Calculate probability of next token, given the previous ones.
-           int n_vocab = llama_n_vocab(ctx);
            std::vector<float> tok_logits(
-               logits + j * n_vocab,
-               logits + (j + 1) * n_vocab);
-           const float prob = softmax(tok_logits)[tokens[start + j + 1]];
+               logits.begin() + j * n_vocab,
+               logits.begin() + (j + 1) * n_vocab);
+           float prob = softmax(tok_logits)[tokens[start + j + 1]];
            nll += -std::log(prob);
            ++count;
        }

@@ -82,11 +86,13 @@ int main(int argc, char ** argv) {
    gpt_params params;

    params.model = "models/llama-7B/ggml-model.bin";
+   params.n_batch = 512;

    if (gpt_params_parse(argc, argv, params) == false) {
        return 1;
    }

    params.perplexity = true;
+   params.n_batch = std::min(params.n_batch, params.n_ctx);

    if (params.n_ctx > 2048) {
        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
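With the perplexity example now defaulting to `n_batch = 512` (clamped to `n_ctx`), each context window is evaluated in batch-sized chunks. An illustrative run, assuming a plain-text evaluation file and the `--batch_size` spelling used elsewhere in the examples, might be:

```bash
# hypothetical perplexity run with an explicit batch size (the evaluation file path is a placeholder)
./perplexity -m ./models/7B/ggml-model-q4_0.bin -f ./wiki.test.raw --batch_size 256
```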

examples/quantize-stats/quantize-stats.cpp

@@ -1,6 +1,7 @@
#include "ggml.h"
+#define LLAMA_API_INTERNAL
#include "llama.h"
-#include "llama_internal.h"

#include <algorithm>
#include <cassert>

flake.nix

@@ -28,10 +28,8 @@
      ];
      installPhase = ''
        mkdir -p $out/bin
-       mv bin/main $out/bin/llama
-       mv bin/quantize $out/bin/quantize
-       mv bin/embedding $out/bin/embedding
-       mv bin/perplexity $out/bin/perplexity
+       mv bin/* $out/bin/
+       mv $out/bin/main $out/bin/llama

        echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
        cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml

ggml.c

@@ -114,6 +114,14 @@ typedef void* thread_ret_t;
#define GGML_MEM_ALIGN 16
#endif

+#if defined(_MSC_VER) || defined(__MINGW32__)
+#define GGML_ALIGNED_MALLOC(size)  _aligned_malloc(size, GGML_MEM_ALIGN)
+#define GGML_ALIGNED_FREE(ptr)     _aligned_free(ptr)
+#else
+#define GGML_ALIGNED_MALLOC(size)  aligned_alloc(GGML_MEM_ALIGN, size)
+#define GGML_ALIGNED_FREE(ptr)     free(ptr)
+#endif
+
#define UNUSED(x) (void)(x)
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
@@ -483,6 +491,77 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
}
#endif

+#if __ARM_NEON
+
+#if !defined(__aarch64__)
+
+inline static uint16_t vaddvq_u8(uint8x16_t v) {
+    return
+        (uint16_t)vgetq_lane_u8(v, 0)  + (uint16_t)vgetq_lane_u8(v, 1)  +
+        (uint16_t)vgetq_lane_u8(v, 2)  + (uint16_t)vgetq_lane_u8(v, 3)  +
+        (uint16_t)vgetq_lane_u8(v, 4)  + (uint16_t)vgetq_lane_u8(v, 5)  +
+        (uint16_t)vgetq_lane_u8(v, 6)  + (uint16_t)vgetq_lane_u8(v, 7)  +
+        (uint16_t)vgetq_lane_u8(v, 8)  + (uint16_t)vgetq_lane_u8(v, 9)  +
+        (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
+        (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
+        (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
+}
+
+inline static int32_t vaddvq_s16(int16x8_t v) {
+    return
+        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+}
+
+inline static uint32_t vaddvq_u16(uint16x8_t v) {
+    return
+        (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
+        (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
+        (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
+        (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
+}
+
+inline static int32_t vaddvq_s32(int32x4_t v) {
+    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+}
+
+inline static float vaddvq_f32(float32x4_t v) {
+    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
+}
+
+inline float vminvq_f32(float32x4_t v) {
+    return
+        MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
+            MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
+}
+
+inline float vmaxvq_f32(float32x4_t v) {
+    return
+        MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
+            MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
+}
+
+inline int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
+    return vget_low_s8(vcombine_s8(a, b));
+}
+
+inline int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
+    return vget_high_s8(vcombine_s8(a, b));
+}
+
+inline uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
+    return vget_low_u8(vcombine_u8(a, b));
+}
+
+inline uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
+    return vget_high_u8(vcombine_u8(a, b));
+}
+
+#endif
+#endif
+
// method 5
// blocks of QK elements
// represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors)
@@ -1210,15 +1289,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
#define GGML_F32x4_ADD vaddq_f32
#define GGML_F32x4_MUL vmulq_f32
-#if defined(__ARM_FEATURE_QRDMX)
-    #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
-#else
-    #define GGML_F32x4_REDUCE_ONE(x) \
-    (vgetq_lane_f32(x, 0) + \
-     vgetq_lane_f32(x, 1) + \
-     vgetq_lane_f32(x, 2) + \
-     vgetq_lane_f32(x, 3))
-#endif
+#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
#define GGML_F32x4_REDUCE(res, x) \
{ \
    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
@@ -1841,55 +1912,43 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
        // 4-bit -> 8-bit
        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b));
        const int8x16_t v1_0l = vreinterpretq_s8_u8(vandq_u8(v1_0, m4b));
        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
        const int8x16_t v1_0h = vreinterpretq_s8_u8(vshrq_n_u8(v1_0, 4));

        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8(v0_1, m4b));
        const int8x16_t v1_1l = vreinterpretq_s8_u8(vandq_u8(v1_1, m4b));
        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
        const int8x16_t v1_1h = vreinterpretq_s8_u8(vshrq_n_u8(v1_1, 4));

        // sub 8
        const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
        const int8x16_t v1_0ls = vsubq_s8(v1_0l, s8b);
        const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
        const int8x16_t v1_0hs = vsubq_s8(v1_0h, s8b);

        const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
        const int8x16_t v1_1ls = vsubq_s8(v1_1l, s8b);
        const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
        const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b);

#if defined(__ARM_FEATURE_DOTPROD)
-       // dot product into int16x8_t
+       // dot product into int32x4_t
        int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls);
        int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls);

        p_0 = vdotq_s32(p_0, v0_0hs, v1_0hs);
        p_1 = vdotq_s32(p_1, v0_1hs, v1_1hs);

-       // scalar
-#if defined(__ARM_FEATURE_QRDMX)
-       sum0 += x0->d * y0->d * vaddvq_s32(p_0);
-       sum1 += x1->d * y1->d * vaddvq_s32(p_1);
-#else
-       sum0 += x0->d * y0->d * (vgetq_lane_s32(p_0, 0) + vgetq_lane_s32(p_0, 1) + vgetq_lane_s32(p_0, 2) + vgetq_lane_s32(p_0, 3));
-       sum1 += x1->d * y1->d * (vgetq_lane_s32(p_1, 0) + vgetq_lane_s32(p_1, 1) + vgetq_lane_s32(p_1, 2) + vgetq_lane_s32(p_1, 3));
-#endif
+       sum0 += x0->d*y0->d*vaddvq_s32(p_0);
+       sum1 += x1->d*y1->d*vaddvq_s32(p_1);
#else
        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));

        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls));
        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls));
        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs));
        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs));

@@ -1902,14 +1961,8 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
        const int16x8_t p_0 = vaddq_s16(pl_0, ph_0);
        const int16x8_t p_1 = vaddq_s16(pl_1, ph_1);

-       // scalar
-#if defined(__ARM_FEATURE_QRDMX)
-       sum0 += x0->d * y0->d * vaddvq_s16(p_0);
-       sum1 += x1->d * y1->d * vaddvq_s16(p_1);
-#else
-       sum0 += x0->d * y0->d * (vgetq_lane_s16(p_0, 0) + vgetq_lane_s16(p_0, 1) + vgetq_lane_s16(p_0, 2) + vgetq_lane_s16(p_0, 3) + vgetq_lane_s16(p_0, 4) + vgetq_lane_s16(p_0, 5) + vgetq_lane_s16(p_0, 6) + vgetq_lane_s16(p_0, 7));
-       sum1 += x1->d * y1->d * (vgetq_lane_s16(p_1, 0) + vgetq_lane_s16(p_1, 1) + vgetq_lane_s16(p_1, 2) + vgetq_lane_s16(p_1, 3) + vgetq_lane_s16(p_1, 4) + vgetq_lane_s16(p_1, 5) + vgetq_lane_s16(p_1, 6) + vgetq_lane_s16(p_1, 7));
-#endif
+       sum0 += x0->d*y0->d*vaddvq_s16(p_0);
+       sum1 += x1->d*y1->d*vaddvq_s16(p_1);
#endif
    }
@@ -2152,18 +2205,20 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
        const uint8_t * restrict p0 = x[i].qs;
        const uint8_t * restrict p1 = y[i].qs;

+       int sumi = 0;
        for (int j = 0; j < QK/2; j++) {
            const uint8_t v0 = p0[j];
            const uint8_t v1 = p1[j];

-           const float f0 = d0*((int8_t) (v0 & 0xf) - 8);
-           const float f1 = d0*((int8_t) (v0 >> 4) - 8);
-           const float f2 = d1*((int8_t) (v1 & 0xf) - 8);
-           const float f3 = d1*((int8_t) (v1 >> 4) - 8);
+           const int8_t i0 = (int8_t) (v0 & 0xf) - 8;
+           const int8_t i1 = (int8_t) (v0 >> 4) - 8;
+           const int8_t i2 = (int8_t) (v1 & 0xf) - 8;
+           const int8_t i3 = (int8_t) (v1 >> 4) - 8;

-           sumf += f0*f2 + f1*f3;
+           sumi += i0*i2 + i1*i3;
        }
+       sumf += d0 * d1 * sumi;
    }
#endif
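The rewritten scalar path accumulates the integer nibble products first and applies the two per-block scales once per block; the rearrangement relies on nothing more than factoring the constant scales out of the sum:

$$\sum_{j}\big(d_0\,i_{0,j}\big)\big(d_1\,i_{2,j}\big)+\big(d_0\,i_{1,j}\big)\big(d_1\,i_{3,j}\big)\;=\;d_0\,d_1\sum_{j}\big(i_{0,j}\,i_{2,j}+i_{1,j}\,i_{3,j}\big)$$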
@@ -2255,36 +2310,71 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
    float sum10 = 0.0f;
    float sum11 = 0.0f;

-   for (int i = 0; i < nb; ++i) {
+   for (int i = 0; i < nb; i += 2) {
        const block_q4_1 * restrict x0 = &x[i + 0];
        const block_q4_1 * restrict y0 = &y[i + 0];
+       const block_q4_1 * restrict x1 = &x[i + 1];
+       const block_q4_1 * restrict y1 = &y[i + 1];

        const uint8x16_t m4b = vdupq_n_u8(0xf);

        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
        const uint8x16_t v1_0 = vld1q_u8(y0->qs);
+       const uint8x16_t v0_1 = vld1q_u8(x1->qs);
+       const uint8x16_t v1_1 = vld1q_u8(y1->qs);

-       // and with 0xf
+       // 4-bit -> 8-bit
        const uint8x16_t v0_0l = vandq_u8(v0_0, m4b);
        const uint8x16_t v1_0l = vandq_u8(v1_0, m4b);
        const uint8x16_t v0_0h = vshrq_n_u8(v0_0, 4);
        const uint8x16_t v1_0h = vshrq_n_u8(v1_0, 4);

-       // dot product into uint16x8_t
-       const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l));
-       const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l));
-       const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h));
-       const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h));
-
-       const uint16x8_t pl0 = vaddq_u16(pl0l, pl0h);
-       const uint16x8_t ph0 = vaddq_u16(ph0l, ph0h);
+       const uint8x16_t v0_1l = vandq_u8(v0_1, m4b);
+       const uint8x16_t v1_1l = vandq_u8(v1_1, m4b);
+       const uint8x16_t v0_1h = vshrq_n_u8(v0_1, 4);
+       const uint8x16_t v1_1h = vshrq_n_u8(v1_1, 4);

        sum00 += x0->m*y0->m;
        sum01 += y0->m*x0->d*(vaddvq_u8(v0_0l) + vaddvq_u8(v0_0h));
        sum10 += x0->m*y0->d*(vaddvq_u8(v1_0l) + vaddvq_u8(v1_0h));
-       sum11 += x0->d*y0->d*vaddvq_u16(vaddq_u16(pl0, ph0));
+
+       sum00 += x1->m*y1->m;
+       sum01 += y1->m*x1->d*(vaddvq_u8(v0_1l) + vaddvq_u8(v0_1h));
+       sum10 += x1->m*y1->d*(vaddvq_u8(v1_1l) + vaddvq_u8(v1_1h));
+
+#if defined(__ARM_FEATURE_DOTPROD)
+       // dot product into int32x4_t
+       int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l);
+       int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l);
+
+       p_0 = vdotq_s32(p_0, v0_0h, v1_0h);
+       p_1 = vdotq_s32(p_1, v0_1h, v1_1h);
+
+       sum11 += x0->d*y0->d*vaddvq_s32(p_0);
+       sum11 += x1->d*y1->d*vaddvq_s32(p_1);
+#else
+       const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l));
+       const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l));
+       const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h));
+       const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h));
+
+       const uint16x8_t pl1l = vmull_u8(vget_low_u8 (v0_1l), vget_low_u8 (v1_1l));
+       const uint16x8_t pl1h = vmull_u8(vget_high_u8(v0_1l), vget_high_u8(v1_1l));
+       const uint16x8_t ph1l = vmull_u8(vget_low_u8 (v0_1h), vget_low_u8 (v1_1h));
+       const uint16x8_t ph1h = vmull_u8(vget_high_u8(v0_1h), vget_high_u8(v1_1h));
+
+       const uint16x8_t pl_0 = vaddq_u16(pl0l, pl0h);
+       const uint16x8_t ph_0 = vaddq_u16(ph0l, ph0h);
+
+       const uint16x8_t pl_1 = vaddq_u16(pl1l, pl1h);
+       const uint16x8_t ph_1 = vaddq_u16(ph1l, ph1h);
+
+       const uint16x8_t p_0 = vaddq_u16(pl_0, ph_0);
+       const uint16x8_t p_1 = vaddq_u16(pl_1, ph_1);
+
+       sum11 += x0->d*y0->d*vaddvq_u16(p_0);
+       sum11 += x1->d*y1->d*vaddvq_u16(p_1);
+#endif
    }

    sumf = QK*sum00 + sum01 + sum10 + sum11;
@@ -2966,7 +3056,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
    *ctx = (struct ggml_context) {
        /*.mem_size =*/ params.mem_size,
-       /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
+       /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(params.mem_size),
        /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
        /*.no_alloc =*/ params.no_alloc,
        /*.n_objects =*/ 0,

@@ -3001,7 +3091,7 @@ void ggml_free(struct ggml_context * ctx) {
                __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);

            if (ctx->mem_buffer_owned) {
-               free(ctx->mem_buffer);
+               GGML_ALIGNED_FREE(ctx->mem_buffer);
            }

            found = true;

@@ -6435,7 +6525,7 @@ static void ggml_compute_forward_mul_mat_f32(
        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                ne11, ne01, ne10,
                1.0f, y, ne10,
-               x, ne10,
+               x, ne00,
                0.0f, d, ne01);
    }
}

@@ -6607,7 +6697,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                ne11, ne01, ne10,
                1.0f, y, ne10,
-               x, ne10,
+               x, ne00,
                0.0f, d, ne01);
    }
}

@@ -6820,7 +6910,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                ne11, ne01, ne10,
                1.0f, y, ne10,
-               x, ne10,
+               x, ne00,
                0.0f, d, ne01);
    }
}

@@ -9273,7 +9363,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
    struct ggml_cgraph result = {
        /*.n_nodes =*/ 0,
        /*.n_leafs =*/ 0,
-       /*.n_threads =*/ 0,
+       /*.n_threads =*/ GGML_DEFAULT_N_THREADS,
        /*.work_size =*/ 0,
        /*.work =*/ NULL,
        /*.nodes =*/ { NULL },

@@ -9894,7 +9984,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
    GGML_PRINT("=== GRAPH ===\n");
    GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads);
-   GGML_PRINT_DEBUG("total work size = %zu bytes\n",cgraph->work_size);
+   GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);
    GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
    for (int i = 0; i < cgraph->n_nodes; i++) {

ggml.h

@@ -182,6 +182,7 @@ extern "C" {
#define GGML_MAX_PARAMS        16
#define GGML_MAX_CONTEXTS      64
#define GGML_MAX_OPT           4
+#define GGML_DEFAULT_N_THREADS 4

#ifdef __ARM_NEON
// we use the built-in 16-bit float type

llama.cpp

@@ -5,7 +5,6 @@
#include "llama_util.h"
#include "llama.h"
-#include "llama_internal.h"

#include "ggml.h"

@@ -827,7 +826,9 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
        case LLAMA_FTYPE_MOSTLY_F16:  return "mostly F16";
        case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
        case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
-       default: LLAMA_ASSERT(false);
+       case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
+                                     return "mostly Q4_1, some F16";
+       default: return "unknown, may not work";
    }
}

llama.h

@@ -71,6 +71,7 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_F16  = 1, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+       LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
    };

    LLAMA_API struct llama_context_params llama_context_default_params();

@@ -178,4 +179,15 @@ extern "C" {
}
#endif

+// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
+#ifdef LLAMA_API_INTERNAL
+
+#include <vector>
+#include <string>
+struct ggml_tensor;
+
+std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+
+#endif
+
#endif // LLAMA_H

llama_internal.h

@@ -1,12 +0,0 @@
-// Internal header to be included by llama.cpp and tests/benchmarks only.
-#ifndef LLAMA_INTERNAL_H
-#define LLAMA_INTERNAL_H
-
-#include <vector>
-#include <string>
-
-struct ggml_tensor;
-
-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
-
-#endif // LLAMA_INTERNAL_H