From 82d146df9b43cf677e0dbce20b03cf864958a0cc Mon Sep 17 00:00:00 2001
From: Pavol Rusnak <pavol@rusnak.io>
Date: Thu, 13 Apr 2023 11:33:16 +0200
Subject: [PATCH 01/34] do not force the prompt file to end with a new line
 (#908)

---
 .editorconfig             | 3 +++
 prompts/chat-with-bob.txt | 2 +-
 prompts/reason-act.txt    | 2 +-
 3 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/.editorconfig b/.editorconfig
index df8aaf504..135a7e4bc 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -14,3 +14,6 @@ indent_size = 4
 
 [Makefile]
 indent_style = tab
+
+[prompts/*.txt]
+insert_final_newline = unset
diff --git a/prompts/chat-with-bob.txt b/prompts/chat-with-bob.txt
index 009da39ae..ad494d831 100644
--- a/prompts/chat-with-bob.txt
+++ b/prompts/chat-with-bob.txt
@@ -4,4 +4,4 @@ User: Hello, Bob.
 Bob: Hello. How may I help you today?
 User: Please tell me the largest city in Europe.
 Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
-User:
+User:
\ No newline at end of file
diff --git a/prompts/reason-act.txt b/prompts/reason-act.txt
index 872016631..a4f4f4ee6 100644
--- a/prompts/reason-act.txt
+++ b/prompts/reason-act.txt
@@ -15,4 +15,4 @@ Answer: The calculate tool says it is 9.3333333333
 Question: What is capital of france?
 Thought: Do I need to use an action? No, I know the answer
 Answer: Paris is the capital of France
-Question:
+Question:
\ No newline at end of file

From 95ea26f6e92d620a5437f576b80868aee7f808d6 Mon Sep 17 00:00:00 2001
From: SebastianApel <13675545+SebastianApel@users.noreply.github.com>
Date: Thu, 13 Apr 2023 14:46:23 +0200
Subject: [PATCH 02/34] benchmark : add tool for timing q4_0 matrix
 multiplication (#653)

* Initial version of q4_0 matrix multiplication benchmark

* Bugfix: Added dependency to ggml.o to benchmark

* Reviewer requests: added parameter for threads, switched to ggml_time_us()

* Reviewer input: removed rdtsc, use epsilon for check

* Review comment: Removed set_locale

* Feature: Param for number of iterations, Bugfix for use of parameter threads

* Reviewer suggestion: Moved to examples

* Reviewer feedback: Updated clean: and benchmark: sections

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 Makefile                                    |   7 +-
 examples/benchmark/benchmark-q4_0-matmult.c | 270 ++++++++++++++++++++
 2 files changed, 276 insertions(+), 1 deletion(-)
 create mode 100644 examples/benchmark/benchmark-q4_0-matmult.c

diff --git a/Makefile b/Makefile
index 3e58a28a7..fe2f26ecb 100644
--- a/Makefile
+++ b/Makefile
@@ -149,7 +149,7 @@ common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
 
 clean:
-	rm -vf *.o main quantize quantize-stats perplexity embedding
+	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult
 
 main: examples/main/main.cpp ggml.o llama.o common.o
 	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
@@ -171,10 +171,15 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
 
 libllama.so: llama.o ggml.o
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
+  
 #
 # Tests
 #
 
+benchmark: ggml.o
+	$(CXX) $(CXXFLAGS) examples/benchmark/benchmark-q4_0-matmult.c ggml.o -o benchmark-q4_0-matmult $(LDFLAGS)	
+	./benchmark-q4_0-matmult
+	
 .PHONY: tests
 tests:
 	bash ./tests/run-tests.sh
diff --git a/examples/benchmark/benchmark-q4_0-matmult.c b/examples/benchmark/benchmark-q4_0-matmult.c
new file mode 100644
index 000000000..9ca9b133a
--- /dev/null
+++ b/examples/benchmark/benchmark-q4_0-matmult.c
@@ -0,0 +1,270 @@
+/*
+    License: MIT License
+
+    Changelog:
+    - 2023-03-31 Initial version by Sebastian Apel (https://github.com/SebastianApel)
+
+*/
+
+#include <locale.h>
+#include "ggml.h"
+#include <assert.h>
+#include <math.h>
+#include <cstring>
+#include <cstdio>
+#include <cinttypes>
+#include <unordered_map>
+#include <queue>
+#include <string.h>
+#include <cassert>
+#include <fstream>
+#include <string>
+#include <iterator>
+#include <algorithm>
+
+float tensor_sum_elements(struct ggml_tensor * tensor) {
+    float sum = 0;
+    if (tensor->type==6) { 
+        for (int j = 0; j < tensor->ne[1]; j++) { 
+            for (int k = 0; k < tensor->ne[0]; k++) { 
+                sum +=  ((float *) tensor->data)[j*tensor->ne[0]+k]; 
+            } 
+        } 
+    }
+    return sum;
+}
+
+
+/*
+    These are mapping to unknown
+    GGML_TYPE_I8,
+    GGML_TYPE_I16,
+    GGML_TYPE_I32,    
+    GGML_TYPE_COUNT,
+*/
+
+#define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN"
+
+#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \
+        TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\
+        TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
+    { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }
+
+struct benchmark_params_struct {    
+    int32_t n_threads     = 1;
+    int32_t n_iterations  = 10;
+};
+
+void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stderr, "  -i N, --iter N     number of iterations to use during computation (default: %d)\n", params.n_iterations);
+    fprintf(stderr, "\n");
+}
+
+int main(int argc, char ** argv)  {
+
+    
+    struct benchmark_params_struct benchmark_params;
+
+    bool invalid_param = false;
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+
+        if (arg == "-t" || arg == "--threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            benchmark_params.n_threads = std::stoi(argv[i]);
+        } else if (arg == "-i" || arg == "--iter") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            benchmark_params.n_iterations = std::stoi(argv[i]);
+        }  else if (arg == "-h" || arg == "--help") {
+            print_usage(argc, argv, benchmark_params);
+            exit(0);
+        }     
+        if (invalid_param) {
+            fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+            print_usage(argc, argv, benchmark_params);
+            exit(1);
+        }
+    }
+
+
+    // create the ggml context
+    printf("Starting Test\n");
+    
+
+    
+    struct ggml_context * ctx;
+    //const int sizex = 4096;
+    //const int sizey = 11008;
+
+#undef VERBOSE_DEBUGGING
+#ifndef VERBOSE_DEBUGGING
+    const int sizey = 4096;
+    const int sizex = 11008;  
+    const int sizez = 128;
+#else
+    /* Working - let's increase size */
+    const int sizey = 1;
+    const int sizex = (8*32);  
+    const int sizez = 1;
+
+    /*const int sizey = 1;
+    const int sizex = 3*(8*32);  
+    const int sizez = 1;*/
+#endif
+
+    //printf("Memsize required = %i\n", sizex*sizex);
+    ggml_type wtype = GGML_TYPE_F32;    
+    
+    size_t ctx_size = 0;
+    ctx_size += sizex*sizey*ggml_type_sizef(wtype);
+    ctx_size += sizex*sizey*ggml_type_sizef(wtype);
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
+    ctx_size += sizex*sizeof(float);
+    ctx_size += 1024*1024*100;    
+    
+    printf("Allocating Memory of size %li byes, %li MB\n",ctx_size, (ctx_size/1024/1024));
+    
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ ctx_size,
+        /*.mem_buffer =*/ NULL,
+        /* no_alloc   =*/ 0
+    };
+
+    ctx = ggml_init(params);
+    if (!ctx) {
+        fprintf(stderr, "%s: ggml_init() failed\n", __func__);
+        return false;
+    }
+    
+    
+    printf("Creating new tensors\n");
+    // printf("Creating new tensor m1\n");
+    struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
+    ggml_set_f32(m11, 1.0f);
+    
+    // printf("Creating new tensor m1\n");
+    struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
+    ggml_set_f32(m12, 1.5f);
+    
+    // printf("Creating new tensor m2\n");
+    struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
+    ggml_set_f32(m2, 2.0f);
+    
+    printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
+    // printf("Creating new tensor m11xm2\n");
+    struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
+    
+    // printf("Creating compute graph\n");
+    struct ggml_cgraph gf = ggml_build_forward(m11xm2);
+    
+    gf.n_threads=benchmark_params.n_threads;
+    printf("cgraph->n_threads=%i\n",gf.n_threads); 
+    
+    TENSOR_DUMP(m11);
+    TENSOR_DUMP(m2);
+    
+    ggml_graph_compute(ctx, &gf);
+
+    TENSOR_DUMP(gf.nodes[0]);
+    
+    printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
+        
+    int32_t nelements = sizex*sizey;
+    int32_t ne[2] = { sizex, sizey };
+        
+    std::vector<int64_t> hist_cur(1 << 4, 0);    
+
+    // Set up a the benchmark matrices
+    // printf("Creating new tensor q11 & Running quantize\n");
+    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
+    ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
+    
+    // Set up a the compute graph
+    // printf("Creating new tensor q31\n");
+    struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
+        
+    // printf("Creating compute graph\n");
+    struct ggml_cgraph gf31 = ggml_build_forward(q31);
+    gf31.n_threads=benchmark_params.n_threads;
+    
+    // Set up a second graph computation to make sure we override the CPU cache lines    
+    // printf("Creating new tensor q12 & Running quantize\n");
+    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
+    ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
+
+    // printf("Creating new tensor q32\n");
+    struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
+        
+    //printf("Creating compute graph\n");
+    struct ggml_cgraph gf32 = ggml_build_forward(q32);
+    gf32.n_threads=benchmark_params.n_threads;
+    printf("cgraph->n_threads=%i\n",gf31.n_threads); 
+    
+    const int dimx = sizex;
+    const int dimy = sizey;
+    const int dimz = sizez;
+    long long int flops_per_dot_product = dimy + dimy;
+    long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ;
+    printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - aboout %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
+   
+
+    // Let's use the F32 result from above as a reference for the q4_0 multiplication
+    float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
+    
+
+    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n");
+    printf("==============================================================================================\n");
+    
+    for (int i=0;i<benchmark_params.n_iterations ;i++) {
+    
+        long long int start = ggml_time_us();
+        //printf("Running ggml_graph_compute\n");
+        ggml_graph_compute(ctx, &gf31);
+        long long int stop = ggml_time_us();
+        long long int usec = stop-start;
+        float sec = usec/1000000;
+        float flops_per_usec = (1.0f*flops_per_matrix)/usec;
+        printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n",
+            i,
+            gf31.n_threads, 
+            sizex, sizey, sizez, flops_per_matrix, 
+            usec,flops_per_usec);
+
+#ifdef VERBOSE_DEBUGGING
+        TENSOR_DUMP("res",gf31.nodes[0])
+#endif
+
+        // Check that the matrix multiplication result is in the right ballpark        
+        // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
+        float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
+        float delta = abs(sum_of_Q4_result - sum_of_F32_reference);
+        float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; //  Let's accept an epsilon of 10^-6
+
+        if (delta > allowed_delta)  {
+            printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n",
+                sum_of_F32_reference, 
+                sum_of_Q4_result,
+                delta,
+                allowed_delta
+            );
+            exit(0);
+        }
+        
+        // Running a different graph computation to make sure we override the CPU cache lines    
+        ggml_graph_compute(ctx, &gf32);
+        
+    }
+    
+}

From 585d91a156794d30eec16ebe67c8d7a1d41406c1 Mon Sep 17 00:00:00 2001
From: anzz1 <anzz1@live.com>
Date: Thu, 13 Apr 2023 15:48:21 +0300
Subject: [PATCH 03/34] cmake : add explicit F16C option (x86) (#576)

Fixes building for x86 processors that lack the F16C feature set.
MSVC is not included, because in MSVC F16C is implied by AVX2/AVX512.
---
 CMakeLists.txt | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6bec1f97b..affff3ea1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,6 +56,10 @@ option(LLAMA_AVX                    "llama: enable AVX"
 option(LLAMA_AVX2                   "llama: enable AVX2"                                    ON)
 option(LLAMA_AVX512                 "llama: enable AVX512"                                  OFF)
 option(LLAMA_FMA                    "llama: enable FMA"                                     ON)
+# in MSVC F16C is implied with AVX2/AVX512
+if (NOT MSVC)
+    option(LLAMA_F16C               "llama: enable F16C"                                    ON)
+endif()
 
 # 3rd party libs
 option(LLAMA_ACCELERATE             "llama: enable Accelerate framework"                    ON)
@@ -207,7 +211,9 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
             add_compile_options(/arch:AVX)
         endif()
     else()
-        add_compile_options(-mf16c)
+        if (LLAMA_F16C)
+            add_compile_options(-mf16c)
+        endif()
         if (LLAMA_FMA)
             add_compile_options(-mfma)
         endif()

From 107980d970808c2ccf9334ad033e2782a560b911 Mon Sep 17 00:00:00 2001
From: niansa/tuxifan <anton-sa@web.de>
Date: Thu, 13 Apr 2023 15:03:39 +0200
Subject: [PATCH 04/34] examples : add -n to alpaca and gpt4all scripts (#706)

---
 examples/alpaca.sh  | 2 +-
 examples/gpt4all.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/alpaca.sh b/examples/alpaca.sh
index 4c9aa5077..8d6261730 100755
--- a/examples/alpaca.sh
+++ b/examples/alpaca.sh
@@ -7,4 +7,4 @@
 cd `dirname $0`
 cd ..
 
-./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7
+./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt --ctx_size 2048 -n -1 -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7
diff --git a/examples/gpt4all.sh b/examples/gpt4all.sh
index d974f95a9..5fd739e55 100755
--- a/examples/gpt4all.sh
+++ b/examples/gpt4all.sh
@@ -10,6 +10,6 @@ cd ..
 ./main --color --instruct --threads 4 \
        --model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
        --file ./prompts/alpaca.txt \
-       --batch_size 8 --ctx_size 2048 \
+       --batch_size 8 --ctx_size 2048 -n -1 \
        --repeat_last_n 64 --repeat_penalty 1.3 \
        --n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95

From 8c3ffc2f048a372639906fb30ec3c2070288d3be Mon Sep 17 00:00:00 2001
From: Vladimir <bogdad@gmail.com>
Date: Thu, 13 Apr 2023 15:24:30 +0200
Subject: [PATCH 05/34] ggml : update cblas_sgemm columns var to be more
 reasonable (#838)

---
 ggml.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ggml.c b/ggml.c
index a26b4853f..546da30d1 100644
--- a/ggml.c
+++ b/ggml.c
@@ -6435,7 +6435,7 @@ static void ggml_compute_forward_mul_mat_f32(
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                         ne11, ne01, ne10,
                         1.0f,    y, ne10,
-                                 x, ne10,
+                                 x, ne00,
                         0.0f,    d, ne01);
             }
         }
@@ -6607,7 +6607,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                         ne11, ne01, ne10,
                         1.0f,    y, ne10,
-                                 x, ne10,
+                                 x, ne00,
                         0.0f,    d, ne01);
             }
         }
@@ -6820,7 +6820,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                         ne11, ne01, ne10,
                         1.0f,    y, ne10,
-                                 x, ne10,
+                                 x, ne00,
                         0.0f,    d, ne01);
             }
         }

From 4579af95e8e16910f6dbab0994917a5b3901f0cf Mon Sep 17 00:00:00 2001
From: Judd <foldl@users.noreply.github.com>
Date: Thu, 13 Apr 2023 21:43:22 +0800
Subject: [PATCH 06/34] zig : update build.zig (#872)

* update

* update readme

* minimize the changes.

---------

Co-authored-by: zjli2019 <zhengji.li@ingchips.com>
---
 README.md | 40 +++++++++++++++++++++++++++++++---------
 build.zig | 22 ++++++++--------------
 2 files changed, 39 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index dbc088532..c0958ebd6 100644
--- a/README.md
+++ b/README.md
@@ -149,21 +149,43 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8
 
 ## Usage
 
-Here are the step for the LLaMA-7B model:
+Here are the step for the LLaMA-7B model.
+
+### Get the Code
 
 ```bash
-# build this repo
 git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
-make
+```
 
-#For Windows and CMake, use the following command instead:
-cd <path_to_llama_folder>
-mkdir build
-cd build
-cmake ..
-cmake --build . --config Release
+### Build
 
+Note: For Windows, CMake or Zig can be used.
+
+1. Use `make`
+
+    ```bash
+    make
+    ```
+
+1. Use CMake
+
+    ```bash
+    mkdir build
+    cd build
+    cmake ..
+    cmake --build . --config Release
+    ```
+
+1. Use Zig
+
+    ```bash
+    zig build -Drelease-fast
+    ```
+
+### Prepare Data & Run
+
+```bash
 # obtain the original LLaMA model weights and place them in ./models
 ls ./models
 65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
diff --git a/build.zig b/build.zig
index defc2c3ad..306127ffe 100644
--- a/build.zig
+++ b/build.zig
@@ -1,16 +1,14 @@
 const std = @import("std");
 
-pub fn build(b: *std.Build) void {
+pub fn build(b: *std.build.Builder) void {
     const target = b.standardTargetOptions(.{});
-    const optimize = b.standardOptimizeOption(.{});
+    const optimize = b.standardReleaseOptions();
     const want_lto = b.option(bool, "lto", "Want -fLTO");
 
-    const lib = b.addStaticLibrary(.{
-        .name = "llama",
-        .target = target,
-        .optimize = optimize,
-    });
+    const lib = b.addStaticLibrary("llama", null);
     lib.want_lto = want_lto;
+    lib.setTarget(target);
+    lib.setBuildMode(optimize);
     lib.linkLibCpp();
     lib.addIncludePath(".");
     lib.addIncludePath("examples");
@@ -44,16 +42,12 @@ pub fn build(b: *std.Build) void {
 fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
     const b = args.b;
     const lib = args.lib;
-    const target = args.target;
-    const optimize = args.optimize;
     const want_lto = args.want_lto;
 
-    const exe = b.addExecutable(.{
-        .name = name,
-        .target = target,
-        .optimize = optimize,
-    });
+    const exe = b.addExecutable(name, null);
     exe.want_lto = want_lto;
+    lib.setTarget(args.target);
+    lib.setBuildMode(args.optimize);
     exe.addIncludePath(".");
     exe.addIncludePath("examples");
     exe.addCSourceFiles(&.{

From c729ff730a46a135817a3d9988a097e3678a9722 Mon Sep 17 00:00:00 2001
From: Pavol Rusnak <pavol@rusnak.io>
Date: Thu, 13 Apr 2023 15:49:05 +0200
Subject: [PATCH 07/34] flake.nix: add all binaries from bin (#848)

---
 flake.nix | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/flake.nix b/flake.nix
index cd1b6d28e..91d2edd79 100644
--- a/flake.nix
+++ b/flake.nix
@@ -28,10 +28,8 @@
           ];
           installPhase = ''
             mkdir -p $out/bin
-            mv bin/main $out/bin/llama
-            mv bin/quantize $out/bin/quantize
-            mv bin/embedding $out/bin/embedding
-            mv bin/perplexity $out/bin/perplexity
+            mv bin/* $out/bin/
+            mv $out/bin/main $out/bin/llama
 
             echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
             cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml

From 7e941b95eba067cb5b92785e642fd803657376ee Mon Sep 17 00:00:00 2001
From: "Genkagaku.GPT" <hlhr202@163.com>
Date: Thu, 13 Apr 2023 21:54:27 +0800
Subject: [PATCH 08/34] readme : llama node binding (#911)

* chore: add nodejs binding

* chore: add nodejs binding
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index c0958ebd6..a7f220eb2 100644
--- a/README.md
+++ b/README.md
@@ -49,6 +49,7 @@ New features will probably be added mostly through community contributions.
 
 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
+- Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
 
 **UI:**
 

From ec29272175d7a79681d9919f3e755b1bcefa0478 Mon Sep 17 00:00:00 2001
From: CRD716 <crd716@gmail.com>
Date: Thu, 13 Apr 2023 08:59:53 -0500
Subject: [PATCH 09/34] readme : remove python 3.10 warning (#929)

---
 README.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/README.md b/README.md
index a7f220eb2..c88e0de28 100644
--- a/README.md
+++ b/README.md
@@ -204,8 +204,6 @@ python3 convert-pth-to-ggml.py models/7B/ 1
 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
 ```
 
-Currently, it's best to use Python 3.9 or Python 3.10, as `sentencepiece` has not yet published a wheel for Python 3.11.
-
 When running the larger models, make sure you have enough disk space to store all the intermediate files.
 
 ### Memory/Disk Requirements

From 8cda5c981d0bf4dcb7664194b2cb9a06e2dbdd54 Mon Sep 17 00:00:00 2001
From: CRD716 <crd716@gmail.com>
Date: Thu, 13 Apr 2023 09:03:57 -0500
Subject: [PATCH 10/34] fix whitespace (#944)

---
 Makefile                                    |   6 +-
 examples/benchmark/benchmark-q4_0-matmult.c | 106 ++++++++++----------
 2 files changed, 56 insertions(+), 56 deletions(-)

diff --git a/Makefile b/Makefile
index fe2f26ecb..c7ccf462d 100644
--- a/Makefile
+++ b/Makefile
@@ -171,15 +171,15 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
 
 libllama.so: llama.o ggml.o
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
-  
+
 #
 # Tests
 #
 
 benchmark: ggml.o
-	$(CXX) $(CXXFLAGS) examples/benchmark/benchmark-q4_0-matmult.c ggml.o -o benchmark-q4_0-matmult $(LDFLAGS)	
+	$(CXX) $(CXXFLAGS) examples/benchmark/benchmark-q4_0-matmult.c ggml.o -o benchmark-q4_0-matmult $(LDFLAGS)
 	./benchmark-q4_0-matmult
-	
+
 .PHONY: tests
 tests:
 	bash ./tests/run-tests.sh
diff --git a/examples/benchmark/benchmark-q4_0-matmult.c b/examples/benchmark/benchmark-q4_0-matmult.c
index 9ca9b133a..90f537fd8 100644
--- a/examples/benchmark/benchmark-q4_0-matmult.c
+++ b/examples/benchmark/benchmark-q4_0-matmult.c
@@ -24,12 +24,12 @@
 
 float tensor_sum_elements(struct ggml_tensor * tensor) {
     float sum = 0;
-    if (tensor->type==6) { 
-        for (int j = 0; j < tensor->ne[1]; j++) { 
-            for (int k = 0; k < tensor->ne[0]; k++) { 
-                sum +=  ((float *) tensor->data)[j*tensor->ne[0]+k]; 
-            } 
-        } 
+    if (tensor->type==6) {
+        for (int j = 0; j < tensor->ne[1]; j++) {
+            for (int k = 0; k < tensor->ne[0]; k++) {
+                sum +=  ((float *) tensor->data)[j*tensor->ne[0]+k];
+            }
+        }
     }
     return sum;
 }
@@ -39,7 +39,7 @@ float tensor_sum_elements(struct ggml_tensor * tensor) {
     These are mapping to unknown
     GGML_TYPE_I8,
     GGML_TYPE_I16,
-    GGML_TYPE_I32,    
+    GGML_TYPE_I32,
     GGML_TYPE_COUNT,
 */
 
@@ -50,7 +50,7 @@ float tensor_sum_elements(struct ggml_tensor * tensor) {
         TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
     { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }
 
-struct benchmark_params_struct {    
+struct benchmark_params_struct {
     int32_t n_threads     = 1;
     int32_t n_iterations  = 10;
 };
@@ -67,7 +67,7 @@ void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct para
 
 int main(int argc, char ** argv)  {
 
-    
+
     struct benchmark_params_struct benchmark_params;
 
     bool invalid_param = false;
@@ -90,7 +90,7 @@ int main(int argc, char ** argv)  {
         }  else if (arg == "-h" || arg == "--help") {
             print_usage(argc, argv, benchmark_params);
             exit(0);
-        }     
+        }
         if (invalid_param) {
             fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
             print_usage(argc, argv, benchmark_params);
@@ -101,9 +101,9 @@ int main(int argc, char ** argv)  {
 
     // create the ggml context
     printf("Starting Test\n");
-    
 
-    
+
+
     struct ggml_context * ctx;
     //const int sizex = 4096;
     //const int sizey = 11008;
@@ -111,31 +111,31 @@ int main(int argc, char ** argv)  {
 #undef VERBOSE_DEBUGGING
 #ifndef VERBOSE_DEBUGGING
     const int sizey = 4096;
-    const int sizex = 11008;  
+    const int sizex = 11008;
     const int sizez = 128;
 #else
     /* Working - let's increase size */
     const int sizey = 1;
-    const int sizex = (8*32);  
+    const int sizex = (8*32);
     const int sizez = 1;
 
     /*const int sizey = 1;
-    const int sizex = 3*(8*32);  
+    const int sizex = 3*(8*32);
     const int sizez = 1;*/
 #endif
 
     //printf("Memsize required = %i\n", sizex*sizex);
-    ggml_type wtype = GGML_TYPE_F32;    
-    
+    ggml_type wtype = GGML_TYPE_F32;
+
     size_t ctx_size = 0;
     ctx_size += sizex*sizey*ggml_type_sizef(wtype);
     ctx_size += sizex*sizey*ggml_type_sizef(wtype);
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
     ctx_size += sizex*sizeof(float);
-    ctx_size += 1024*1024*100;    
-    
+    ctx_size += 1024*1024*100;
+
     printf("Allocating Memory of size %li byes, %li MB\n",ctx_size, (ctx_size/1024/1024));
-    
+
     struct ggml_init_params params = {
         /*.mem_size   =*/ ctx_size,
         /*.mem_buffer =*/ NULL,
@@ -147,88 +147,88 @@ int main(int argc, char ** argv)  {
         fprintf(stderr, "%s: ggml_init() failed\n", __func__);
         return false;
     }
-    
-    
+
+
     printf("Creating new tensors\n");
     // printf("Creating new tensor m1\n");
     struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
     ggml_set_f32(m11, 1.0f);
-    
+
     // printf("Creating new tensor m1\n");
     struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
     ggml_set_f32(m12, 1.5f);
-    
+
     // printf("Creating new tensor m2\n");
     struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
     ggml_set_f32(m2, 2.0f);
-    
+
     printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
     // printf("Creating new tensor m11xm2\n");
     struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
-    
+
     // printf("Creating compute graph\n");
     struct ggml_cgraph gf = ggml_build_forward(m11xm2);
-    
+
     gf.n_threads=benchmark_params.n_threads;
-    printf("cgraph->n_threads=%i\n",gf.n_threads); 
-    
+    printf("cgraph->n_threads=%i\n",gf.n_threads);
+
     TENSOR_DUMP(m11);
     TENSOR_DUMP(m2);
-    
+
     ggml_graph_compute(ctx, &gf);
 
     TENSOR_DUMP(gf.nodes[0]);
-    
+
     printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
-        
+
     int32_t nelements = sizex*sizey;
     int32_t ne[2] = { sizex, sizey };
-        
-    std::vector<int64_t> hist_cur(1 << 4, 0);    
+
+    std::vector<int64_t> hist_cur(1 << 4, 0);
 
     // Set up a the benchmark matrices
     // printf("Creating new tensor q11 & Running quantize\n");
     struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
     ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
-    
+
     // Set up a the compute graph
     // printf("Creating new tensor q31\n");
     struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
-        
+
     // printf("Creating compute graph\n");
     struct ggml_cgraph gf31 = ggml_build_forward(q31);
     gf31.n_threads=benchmark_params.n_threads;
-    
-    // Set up a second graph computation to make sure we override the CPU cache lines    
+
+    // Set up a second graph computation to make sure we override the CPU cache lines
     // printf("Creating new tensor q12 & Running quantize\n");
     struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
     ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
 
     // printf("Creating new tensor q32\n");
     struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
-        
+
     //printf("Creating compute graph\n");
     struct ggml_cgraph gf32 = ggml_build_forward(q32);
     gf32.n_threads=benchmark_params.n_threads;
-    printf("cgraph->n_threads=%i\n",gf31.n_threads); 
-    
+    printf("cgraph->n_threads=%i\n",gf31.n_threads);
+
     const int dimx = sizex;
     const int dimy = sizey;
     const int dimz = sizez;
     long long int flops_per_dot_product = dimy + dimy;
     long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ;
     printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - aboout %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
-   
+
 
     // Let's use the F32 result from above as a reference for the q4_0 multiplication
     float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
-    
+
 
     printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n");
     printf("==============================================================================================\n");
-    
+
     for (int i=0;i<benchmark_params.n_iterations ;i++) {
-    
+
         long long int start = ggml_time_us();
         //printf("Running ggml_graph_compute\n");
         ggml_graph_compute(ctx, &gf31);
@@ -238,15 +238,15 @@ int main(int argc, char ** argv)  {
         float flops_per_usec = (1.0f*flops_per_matrix)/usec;
         printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n",
             i,
-            gf31.n_threads, 
-            sizex, sizey, sizez, flops_per_matrix, 
+            gf31.n_threads,
+            sizex, sizey, sizez, flops_per_matrix,
             usec,flops_per_usec);
 
 #ifdef VERBOSE_DEBUGGING
         TENSOR_DUMP("res",gf31.nodes[0])
 #endif
 
-        // Check that the matrix multiplication result is in the right ballpark        
+        // Check that the matrix multiplication result is in the right ballpark
         // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
         float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
         float delta = abs(sum_of_Q4_result - sum_of_F32_reference);
@@ -254,17 +254,17 @@ int main(int argc, char ** argv)  {
 
         if (delta > allowed_delta)  {
             printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n",
-                sum_of_F32_reference, 
+                sum_of_F32_reference,
                 sum_of_Q4_result,
                 delta,
                 allowed_delta
             );
             exit(0);
         }
-        
-        // Running a different graph computation to make sure we override the CPU cache lines    
+
+        // Running a different graph computation to make sure we override the CPU cache lines
         ggml_graph_compute(ctx, &gf32);
-        
+
     }
-    
+
 }

From 6c248707f51c8a50f7792e7f7787ec481881db88 Mon Sep 17 00:00:00 2001
From: Pavol Rusnak <pavol@rusnak.io>
Date: Thu, 13 Apr 2023 16:08:32 +0200
Subject: [PATCH 11/34] ggml : introduce GGML_ALIGNED_MALLOC/GGML_ALIGNED_FREE
 macros (#884)

This allows ggml to allocate its own context buffer with aligned_alloc
(or _aligned_malloc on MSVC/MinGW) and to free it with the matching function.
---
 ggml.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/ggml.c b/ggml.c
index 546da30d1..281fd8ec9 100644
--- a/ggml.c
+++ b/ggml.c
@@ -114,6 +114,14 @@ typedef void* thread_ret_t;
     #define GGML_MEM_ALIGN 16
 #endif
 
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#define GGML_ALIGNED_MALLOC(size)  _aligned_malloc(size, GGML_MEM_ALIGN)
+#define GGML_ALIGNED_FREE(ptr)     _aligned_free(ptr)
+#else
+#define GGML_ALIGNED_MALLOC(size)  aligned_alloc(GGML_MEM_ALIGN, size)
+#define GGML_ALIGNED_FREE(ptr)     free(ptr)
+#endif
+
 #define UNUSED(x) (void)(x)
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
 
@@ -2966,7 +2974,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
     *ctx = (struct ggml_context) {
         /*.mem_size           =*/ params.mem_size,
-        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
+        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(params.mem_size),
         /*.mem_buffer_owned   =*/ params.mem_buffer ? false : true,
         /*.no_alloc           =*/ params.no_alloc,
         /*.n_objects          =*/ 0,
@@ -3001,7 +3009,7 @@ void ggml_free(struct ggml_context * ctx) {
                     __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
 
             if (ctx->mem_buffer_owned) {
-                free(ctx->mem_buffer);
+                GGML_ALIGNED_FREE(ctx->mem_buffer);
             }
 
             found = true;

From 6232f2d7fd7a22d5eeb62182b2f21fcf01359754 Mon Sep 17 00:00:00 2001
From: Stephan Walter <stephan@walter.name>
Date: Thu, 13 Apr 2023 14:59:50 +0000
Subject: [PATCH 12/34] ggml : optimize non-SIMD Q4_0 vector dot product (#703)

---
 ggml.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/ggml.c b/ggml.c
index 281fd8ec9..eb47d8298 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2160,18 +2160,20 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
         const uint8_t * restrict p0 = x[i].qs;
         const uint8_t * restrict p1 = y[i].qs;
 
+        int sumi = 0;
         for (int j = 0; j < QK/2; j++) {
             const uint8_t v0 = p0[j];
             const uint8_t v1 = p1[j];
 
-            const float f0 = d0*((int8_t) (v0 & 0xf) - 8);
-            const float f1 = d0*((int8_t) (v0 >> 4)  - 8);
+            const int8_t i0 = (int8_t) (v0 & 0xf) - 8;
+            const int8_t i1 = (int8_t) (v0 >> 4)  - 8;
 
-            const float f2 = d1*((int8_t) (v1 & 0xf) - 8);
-            const float f3 = d1*((int8_t) (v1 >> 4)  - 8);
+            const int8_t i2 = (int8_t) (v1 & 0xf) - 8;
+            const int8_t i3 = (int8_t) (v1 >> 4)  - 8;
 
-            sumf += f0*f2 + f1*f3;
+            sumi += i0*i2 + i1*i3;
         }
+        sumf += d0 * d1 * sumi;
     }
 #endif
 

From c85980acd04631a7c43d13676276f76ec72f5dfe Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 13 Apr 2023 18:01:22 +0300
Subject: [PATCH 13/34] gitignore : benchmark

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index d8dd34fb9..ba5cbf1ed 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,6 +23,7 @@ models/*
 /result
 /perplexity
 /embedding
+/benchmark-q4_0-matmult
 /Pipfile
 
 arm_neon.h

From 9190e8eac8bdc108c40d2d7505e9b45fa773251f Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 13 Apr 2023 18:04:45 +0300
Subject: [PATCH 14/34] llama : merge llama_internal.h into llama.h

The internal API is now hidden behind `#ifdef LLAMA_API_INTERNAL`.
---
 CMakeLists.txt                             |  1 -
 Makefile                                   |  2 +-
 examples/quantize-stats/quantize-stats.cpp |  3 ++-
 llama.cpp                                  |  1 -
 llama.h                                    | 11 +++++++++++
 llama_internal.h                           | 12 ------------
 6 files changed, 14 insertions(+), 16 deletions(-)
 delete mode 100644 llama_internal.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index affff3ea1..d5715d92a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -253,7 +253,6 @@ endif()
 add_library(llama
             llama.cpp
             llama.h
-            llama_internal.h
             llama_util.h)
 
 target_include_directories(llama PUBLIC .)
diff --git a/Makefile b/Makefile
index c7ccf462d..7db246650 100644
--- a/Makefile
+++ b/Makefile
@@ -142,7 +142,7 @@ default: main quantize perplexity embedding
 ggml.o: ggml.c ggml.h
 	$(CC)  $(CFLAGS)   -c ggml.c -o ggml.o
 
-llama.o: llama.cpp llama.h llama_util.h llama_internal.h
+llama.o: llama.cpp llama.h llama_util.h
 	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
 
 common.o: examples/common.cpp examples/common.h
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 203bfe8cc..c786fe208 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -1,6 +1,7 @@
 #include "ggml.h"
+
+#define LLAMA_API_INTERNAL
 #include "llama.h"
-#include "llama_internal.h"
 
 #include <algorithm>
 #include <cassert>
diff --git a/llama.cpp b/llama.cpp
index 6d8b706b9..c72295684 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -5,7 +5,6 @@
 
 #include "llama_util.h"
 #include "llama.h"
-#include "llama_internal.h"
 
 #include "ggml.h"
 
diff --git a/llama.h b/llama.h
index 7a258a1e1..192217593 100644
--- a/llama.h
+++ b/llama.h
@@ -179,4 +179,15 @@ extern "C" {
 }
 #endif
 
+// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
+#ifdef LLAMA_API_INTERNAL
+
+#include <vector>
+#include <string>
+struct ggml_tensor;
+
+std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+
+#endif
+
 #endif // LLAMA_H
diff --git a/llama_internal.h b/llama_internal.h
deleted file mode 100644
index 543eed996..000000000
--- a/llama_internal.h
+++ /dev/null
@@ -1,12 +0,0 @@
-// Internal header to be included by llama.cpp and tests/benchmarks only.
-
-#ifndef LLAMA_INTERNAL_H
-#define LLAMA_INTERNAL_H
-
-#include <vector>
-#include <string>
-struct ggml_tensor;
-
-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
-
-#endif // LLAMA_INTERNAL_H

From d990e3fffc5b0f5448e90a16c79a4f2675100af0 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 13 Apr 2023 18:32:36 +0300
Subject: [PATCH 15/34] ggml : speed-up ggml_vec_dot_q4_1() ARM_NEON + 32-bit
 ARM support (#900)

* ggml : speed-up q4_1 ARM_NEON by ~5%

* ggml : implement vaddvq when missing

* ggml : implement vminvq and vmaxvq when missing

* ggml : implement vzip when missing

* ggml : fix comment

* ggml : try to use correct ifdef
---
 ggml.c | 170 ++++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 125 insertions(+), 45 deletions(-)

diff --git a/ggml.c b/ggml.c
index eb47d8298..b6a24b40c 100644
--- a/ggml.c
+++ b/ggml.c
@@ -491,6 +491,77 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
 }
 #endif
 
+#if __ARM_NEON
+
+#if !defined(__aarch64__)
+
+inline static uint16_t vaddvq_u8(uint8x16_t v) {
+    return
+        (uint16_t)vgetq_lane_u8(v, 0)  + (uint16_t)vgetq_lane_u8(v, 1)  +
+        (uint16_t)vgetq_lane_u8(v, 2)  + (uint16_t)vgetq_lane_u8(v, 3)  +
+        (uint16_t)vgetq_lane_u8(v, 4)  + (uint16_t)vgetq_lane_u8(v, 5)  +
+        (uint16_t)vgetq_lane_u8(v, 6)  + (uint16_t)vgetq_lane_u8(v, 7)  +
+        (uint16_t)vgetq_lane_u8(v, 8)  + (uint16_t)vgetq_lane_u8(v, 9)  +
+        (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
+        (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
+        (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
+}
+
+inline static int32_t vaddvq_s16(int16x8_t v) {
+    return
+        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+}
+
+inline static uint32_t vaddvq_u16(uint16x8_t v) {
+    return
+        (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
+        (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
+        (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
+        (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
+}
+
+inline static int32_t vaddvq_s32(int32x4_t v) {
+    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+}
+
+inline static float vaddvq_f32(float32x4_t v) {
+    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
+}
+
+inline float vminvq_f32(float32x4_t v) {
+    return
+        MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
+            MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
+}
+
+inline float vmaxvq_f32(float32x4_t v) {
+    return
+        MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
+            MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
+}
+
+inline int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
+    return vget_low_s8(vcombine_s8(a, b));
+}
+
+inline int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
+    return vget_high_s8(vcombine_s8(a, b));
+}
+
+inline uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
+    return vget_low_u8(vcombine_u8(a, b));
+}
+
+inline uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
+    return vget_high_u8(vcombine_u8(a, b));
+}
+
+#endif
+#endif
+
 // method 5
 // blocks of QK elements
 // represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors)
@@ -1218,15 +1289,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
 #define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
 #define GGML_F32x4_ADD          vaddq_f32
 #define GGML_F32x4_MUL          vmulq_f32
-#if defined(__ARM_FEATURE_QRDMX)
-    #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
-#else
-    #define GGML_F32x4_REDUCE_ONE(x) \
-    (vgetq_lane_f32(x, 0) +          \
-     vgetq_lane_f32(x, 1) +          \
-     vgetq_lane_f32(x, 2) +          \
-     vgetq_lane_f32(x, 3))
-#endif
+#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
 #define GGML_F32x4_REDUCE(res, x)              \
 {                                              \
     for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
@@ -1849,55 +1912,43 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
         // 4-bit -> 8-bit
         const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b));
         const int8x16_t v1_0l = vreinterpretq_s8_u8(vandq_u8(v1_0, m4b));
-
         const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
         const int8x16_t v1_0h = vreinterpretq_s8_u8(vshrq_n_u8(v1_0, 4));
 
         const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8(v0_1, m4b));
         const int8x16_t v1_1l = vreinterpretq_s8_u8(vandq_u8(v1_1, m4b));
-
         const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
         const int8x16_t v1_1h = vreinterpretq_s8_u8(vshrq_n_u8(v1_1, 4));
 
         // sub 8
         const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
         const int8x16_t v1_0ls = vsubq_s8(v1_0l, s8b);
-
         const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
         const int8x16_t v1_0hs = vsubq_s8(v1_0h, s8b);
 
         const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
         const int8x16_t v1_1ls = vsubq_s8(v1_1l, s8b);
-
         const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
         const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b);
 
 #if defined(__ARM_FEATURE_DOTPROD)
-        // dot product into int16x8_t
+        // dot product into int32x4_t
         int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls);
         int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls);
 
         p_0 = vdotq_s32(p_0, v0_0hs, v1_0hs);
         p_1 = vdotq_s32(p_1, v0_1hs, v1_1hs);
 
-        // scalar
-#if defined(__ARM_FEATURE_QRDMX)
-        sum0 += x0->d * y0->d * vaddvq_s32(p_0);
-        sum1 += x1->d * y1->d * vaddvq_s32(p_1);
-#else
-        sum0 += x0->d * y0->d * (vgetq_lane_s32(p_0, 0) + vgetq_lane_s32(p_0, 1) + vgetq_lane_s32(p_0, 2) + vgetq_lane_s32(p_0, 3));
-        sum1 += x1->d * y1->d * (vgetq_lane_s32(p_1, 0) + vgetq_lane_s32(p_1, 1) + vgetq_lane_s32(p_1, 2) + vgetq_lane_s32(p_1, 3));
-#endif
+        sum0 += x0->d*y0->d*vaddvq_s32(p_0);
+        sum1 += x1->d*y1->d*vaddvq_s32(p_1);
 #else
         const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
         const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
-
         const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
         const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));
 
         const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls));
         const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls));
-
         const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs));
         const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs));
 
@@ -1910,14 +1961,8 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
         const int16x8_t p_0 = vaddq_s16(pl_0, ph_0);
         const int16x8_t p_1 = vaddq_s16(pl_1, ph_1);
 
-        // scalar
-#if defined(__ARM_FEATURE_QRDMX)
-        sum0 += x0->d * y0->d * vaddvq_s16(p_0);
-        sum1 += x1->d * y1->d * vaddvq_s16(p_1);
-#else
-        sum0 += x0->d * y0->d * (vgetq_lane_s16(p_0, 0) + vgetq_lane_s16(p_0, 1) + vgetq_lane_s16(p_0, 2) + vgetq_lane_s16(p_0, 3) + vgetq_lane_s16(p_0, 4) + vgetq_lane_s16(p_0, 5) + vgetq_lane_s16(p_0, 6) + vgetq_lane_s16(p_0, 7));
-        sum1 += x1->d * y1->d * (vgetq_lane_s16(p_1, 0) + vgetq_lane_s16(p_1, 1) + vgetq_lane_s16(p_1, 2) + vgetq_lane_s16(p_1, 3) + vgetq_lane_s16(p_1, 4) + vgetq_lane_s16(p_1, 5) + vgetq_lane_s16(p_1, 6) + vgetq_lane_s16(p_1, 7));
-#endif
+        sum0 += x0->d*y0->d*vaddvq_s16(p_0);
+        sum1 += x1->d*y1->d*vaddvq_s16(p_1);
 #endif
     }
 
@@ -2265,36 +2310,71 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
     float sum10 = 0.0f;
     float sum11 = 0.0f;
 
-    for (int i = 0; i < nb; ++i) {
+    for (int i = 0; i < nb; i += 2) {
         const block_q4_1 * restrict x0 = &x[i + 0];
         const block_q4_1 * restrict y0 = &y[i + 0];
+        const block_q4_1 * restrict x1 = &x[i + 1];
+        const block_q4_1 * restrict y1 = &y[i + 1];
 
         const uint8x16_t m4b = vdupq_n_u8(0xf);
 
         const uint8x16_t v0_0 = vld1q_u8(x0->qs);
         const uint8x16_t v1_0 = vld1q_u8(y0->qs);
+        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
+        const uint8x16_t v1_1 = vld1q_u8(y1->qs);
 
-        // and with 0xf
+        // 4-bit -> 8-bit
         const uint8x16_t v0_0l = vandq_u8(v0_0, m4b);
         const uint8x16_t v1_0l = vandq_u8(v1_0, m4b);
-
         const uint8x16_t v0_0h = vshrq_n_u8(v0_0, 4);
         const uint8x16_t v1_0h = vshrq_n_u8(v1_0, 4);
 
-        // dot product into uint16x8_t
-        const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l));
-        const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l));
-
-        const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h));
-        const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h));
-
-        const uint16x8_t pl0 = vaddq_u16(pl0l, pl0h);
-        const uint16x8_t ph0 = vaddq_u16(ph0l, ph0h);
+        const uint8x16_t v0_1l = vandq_u8(v0_1, m4b);
+        const uint8x16_t v1_1l = vandq_u8(v1_1, m4b);
+        const uint8x16_t v0_1h = vshrq_n_u8(v0_1, 4);
+        const uint8x16_t v1_1h = vshrq_n_u8(v1_1, 4);
 
         sum00 += x0->m*y0->m;
         sum01 += y0->m*x0->d*(vaddvq_u8(v0_0l) + vaddvq_u8(v0_0h));
         sum10 += x0->m*y0->d*(vaddvq_u8(v1_0l) + vaddvq_u8(v1_0h));
-        sum11 += x0->d*y0->d*vaddvq_u16(vaddq_u16(pl0, ph0));
+
+        sum00 += x1->m*y1->m;
+        sum01 += y1->m*x1->d*(vaddvq_u8(v0_1l) + vaddvq_u8(v0_1h));
+        sum10 += x1->m*y1->d*(vaddvq_u8(v1_1l) + vaddvq_u8(v1_1h));
+
+#if defined(__ARM_FEATURE_DOTPROD)
+        // dot product into int32x4_t
+        int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l);
+        int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l);
+
+        p_0 = vdotq_s32(p_0, v0_0h, v1_0h);
+        p_1 = vdotq_s32(p_1, v0_1h, v1_1h);
+
+        sum11 += x0->d*y0->d*vaddvq_s32(p_0);
+        sum11 += x1->d*y1->d*vaddvq_s32(p_1);
+#else
+        const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l));
+        const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l));
+        const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h));
+        const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h));
+
+        const uint16x8_t pl1l = vmull_u8(vget_low_u8 (v0_1l), vget_low_u8 (v1_1l));
+        const uint16x8_t pl1h = vmull_u8(vget_high_u8(v0_1l), vget_high_u8(v1_1l));
+        const uint16x8_t ph1l = vmull_u8(vget_low_u8 (v0_1h), vget_low_u8 (v1_1h));
+        const uint16x8_t ph1h = vmull_u8(vget_high_u8(v0_1h), vget_high_u8(v1_1h));
+
+        const uint16x8_t pl_0 = vaddq_u16(pl0l, pl0h);
+        const uint16x8_t ph_0 = vaddq_u16(ph0l, ph0h);
+
+        const uint16x8_t pl_1 = vaddq_u16(pl1l, pl1h);
+        const uint16x8_t ph_1 = vaddq_u16(ph1l, ph1h);
+
+        const uint16x8_t p_0 = vaddq_u16(pl_0, ph_0);
+        const uint16x8_t p_1 = vaddq_u16(pl_1, ph_1);
+
+        sum11 += x0->d*y0->d*vaddvq_u16(p_0);
+        sum11 += x1->d*y1->d*vaddvq_u16(p_1);
+#endif
     }
 
     sumf = QK*sum00 + sum01 + sum10 + sum11;

From a3a2a0eda8828b60436e9f69d9ac2c1060d03e7a Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 13 Apr 2023 18:36:40 +0300
Subject: [PATCH 16/34] ggml : add GGML_DEFAULT_N_THREADS

---
 ggml.c |  6 +++---
 ggml.h | 11 ++++++-----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/ggml.c b/ggml.c
index b6a24b40c..42e3ee314 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9363,7 +9363,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
     struct ggml_cgraph result = {
         /*.n_nodes      =*/ 0,
         /*.n_leafs      =*/ 0,
-        /*.n_threads    =*/ 0,
+        /*.n_threads    =*/ GGML_DEFAULT_N_THREADS,
         /*.work_size    =*/ 0,
         /*.work         =*/ NULL,
         /*.nodes        =*/ { NULL },
@@ -9983,8 +9983,8 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
 
     GGML_PRINT("=== GRAPH ===\n");
 
-    GGML_PRINT_DEBUG("n_threads       = %d\n",       cgraph->n_threads);
-    GGML_PRINT_DEBUG("total work size = %zu bytes\n",cgraph->work_size);
+    GGML_PRINT_DEBUG("n_threads       = %d\n",        cgraph->n_threads);
+    GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);
 
     GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
     for (int i = 0; i < cgraph->n_nodes; i++) {
diff --git a/ggml.h b/ggml.h
index 7d8b7a182..c06c09e06 100644
--- a/ggml.h
+++ b/ggml.h
@@ -177,11 +177,12 @@ extern "C" {
 #include <stddef.h>
 #include <stdbool.h>
 
-#define GGML_MAX_DIMS     4
-#define GGML_MAX_NODES    4096
-#define GGML_MAX_PARAMS   16
-#define GGML_MAX_CONTEXTS 64
-#define GGML_MAX_OPT      4
+#define GGML_MAX_DIMS          4
+#define GGML_MAX_NODES         4096
+#define GGML_MAX_PARAMS        16
+#define GGML_MAX_CONTEXTS      64
+#define GGML_MAX_OPT           4
+#define GGML_DEFAULT_N_THREADS 4
 
 #ifdef __ARM_NEON
 // we use the built-in 16-bit float type

From 0e07e6a8399fd993739a3ba3c6f95f92bfab6f58 Mon Sep 17 00:00:00 2001
From: CRD716 <crd716@gmail.com>
Date: Thu, 13 Apr 2023 10:39:25 -0500
Subject: [PATCH 17/34] common : remove unnecessary includes (#947)

---
 examples/common.cpp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index 91d96efae..0772dbfe1 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -7,12 +7,6 @@
 #include <iterator>
 #include <algorithm>
 
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <malloc.h> // using malloc.h with MSC/MINGW
-#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
-#include <alloca.h>
-#endif
-
 #if defined (_WIN32)
 #include <fcntl.h>
 #include <io.h>

From be87b6ed20a5f7528bf491a83e759a9fc6a24fea Mon Sep 17 00:00:00 2001
From: Gary Linscott <glinscott@gmail.com>
Date: Thu, 13 Apr 2023 14:50:42 -0700
Subject: [PATCH 18/34] perplexity : add support for batch size to
 `--perplexity` (#407)

* Add support to batch size for perplexity

* Revert "Fix memory allocation issues and seg faults"

This reverts commit 4870e455b3653f7d7769fa5772b2c90ffad088df.

* update from merge

* Remove perplexity from main

* updates

* Update batch size for efficiency
---
 examples/perplexity/perplexity.cpp | 36 +++++++++++++++++-------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index b62f00d0c..38e3643b1 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -27,20 +27,27 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
 
     int count = 0;
     int seq_count = tokens.size() / params.n_ctx;
+    int n_vocab = llama_n_vocab(ctx);
 
     double nll = 0.0;
-
-    fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
+    fprintf(stderr, "%s : calculating perplexity over %d chunks, batch_size=%d\n", __func__, seq_count, params.n_batch);
 
     for (int i = 0; i < seq_count; ++i) {
         int start = i * params.n_ctx;
-        int end = start + params.n_ctx - 1; // TODO: this is not optimal, e.g. it makes the batch 511 instead of 512
-                                            //       it is better to always be power of 2 for better performance
-        std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
+        int end = start + params.n_ctx;
+
+        std::vector<float> logits;
+        int num_batches = (params.n_ctx + params.n_batch - 1) / params.n_batch;
         auto start_t = std::chrono::high_resolution_clock::now();
-        if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
-            return;
+        for (int j = 0; j < num_batches; ++j) {
+            int batch_start = start + j * params.n_batch;
+            int batch_size = std::min(end - batch_start, params.n_batch);
+            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * params.n_batch, params.n_threads)) {
+                fprintf(stderr, "%s : failed to eval\n", __func__);
+                return;
+            }
+            auto batch_logits = llama_get_logits(ctx);
+            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
         }
         auto end_t = std::chrono::high_resolution_clock::now();
         if (i == 0) {
@@ -59,15 +66,12 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
         // Example, we have a context window of 512, we will compute perplexity for each of the
         // last 256 tokens.  Then, we split the input up into context window size chunks to
         // process the entire prompt.
-
-        auto logits = llama_get_logits(ctx);
-        for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
+        for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) {
             // Calculate probability of next token, given the previous ones.
-            int n_vocab = llama_n_vocab(ctx);
             std::vector<float> tok_logits(
-                logits + j * n_vocab,
-                logits + (j + 1) * n_vocab);
-            const float prob = softmax(tok_logits)[tokens[start + j + 1]];
+                logits.begin() + j * n_vocab,
+                logits.begin() + (j + 1) * n_vocab);
+            float prob = softmax(tok_logits)[tokens[start + j + 1]];
             nll += -std::log(prob);
             ++count;
         }
@@ -82,11 +86,13 @@ int main(int argc, char ** argv) {
     gpt_params params;
     params.model = "models/llama-7B/ggml-model.bin";
 
+    params.n_batch = 512;
     if (gpt_params_parse(argc, argv, params) == false) {
         return 1;
     }
 
     params.perplexity = true;
+    params.n_batch = std::min(params.n_batch, params.n_ctx);
 
     if (params.n_ctx > 2048) {
         fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"

From c5d70f5c9ea5a8f0f6b0d6aa741455978a1dabfd Mon Sep 17 00:00:00 2001
From: Howard Su <howard0su@gmail.com>
Date: Fri, 14 Apr 2023 14:24:52 +0800
Subject: [PATCH 19/34] ggml : optimize rope function to avoid call powf in the
 tight loop (#807)

---
 ggml.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/ggml.c b/ggml.c
index 42e3ee314..e2ebc9e4b 100644
--- a/ggml.c
+++ b/ggml.c
@@ -7507,19 +7507,20 @@ static void ggml_compute_forward_rope_f32(
     // row index used to determine which thread to use
     int ir = 0;
 
+    const float theta_scale = powf(10000.0, ((float)-2)/n_dims);
+
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
             const int p = (mode == 0 ? n_past + i2 : i2);
             for (int64_t i1 = 0; i1 < ne1; i1++) {
                 if (ir++ < ir0) continue;
                 if (ir   > ir1) break;
-
+                float theta = (float)p;
                 for (int i0 = 0; i0 < n_dims; i0 += 2) {
-                    const float theta = powf(10000.0, ((float)-i0)/n_dims);
-
-                    const float cos_theta = cosf(p*theta);
-                    const float sin_theta = sinf(p*theta);
+                    const float cos_theta = cosf(theta);
+                    const float sin_theta = sinf(theta);
 
+                    theta *= theta_scale;
                     const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
                           float * dst_data  = (float *)((char *)  dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
@@ -7580,19 +7581,20 @@ static void ggml_compute_forward_rope_f16(
     // row index used to determine which thread to use
     int ir = 0;
 
+    const float theta_scale = powf(10000.0, ((float)-2)/n_dims);
+
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
             const int p = (mode == 0 ? n_past + i2 : i2);
             for (int64_t i1 = 0; i1 < ne1; i1++) {
                 if (ir++ < ir0) continue;
                 if (ir   > ir1) break;
-
+                float theta = (float)p;
                 for (int i0 = 0; i0 < n_dims; i0 += 2) {
-                    const float theta = powf(10000.0, ((float)-i0)/n_dims);
-
-                    const float cos_theta = cosf(p*theta);
-                    const float sin_theta = sinf(p*theta);
+                    const float cos_theta = cosf(theta);
+                    const float sin_theta = sinf(theta);
 
+                    theta *= theta_scale;
                     const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
                           ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 

From 0f07cacb05f49704d35a39aa27cfd4b419eb6f8d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 14 Apr 2023 09:45:42 +0300
Subject: [PATCH 20/34] ggml : fix q4_1 dot product types

---
 ggml.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ggml.c b/ggml.c
index e2ebc9e4b..d620cd11f 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2344,14 +2344,14 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
 
 #if defined(__ARM_FEATURE_DOTPROD)
         // dot product into int32x4_t
-        int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l);
-        int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l);
+        uint32x4_t p_0 = vdotq_u32(vdupq_n_u32(0), v0_0l, v1_0l);
+        uint32x4_t p_1 = vdotq_u32(vdupq_n_u32(0), v0_1l, v1_1l);
 
-        p_0 = vdotq_s32(p_0, v0_0h, v1_0h);
-        p_1 = vdotq_s32(p_1, v0_1h, v1_1h);
+        p_0 = vdotq_u32(p_0, v0_0h, v1_0h);
+        p_1 = vdotq_u32(p_1, v0_1h, v1_1h);
 
-        sum11 += x0->d*y0->d*vaddvq_s32(p_0);
-        sum11 += x1->d*y1->d*vaddvq_s32(p_1);
+        sum11 += x0->d*y0->d*vaddvq_u32(p_0);
+        sum11 += x1->d*y1->d*vaddvq_u32(p_1);
 #else
         const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l));
         const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l));

From 723dac55fa2ba7adc6e3fc8609781d1ad0378906 Mon Sep 17 00:00:00 2001
From: comex <comexk@gmail.com>
Date: Fri, 14 Apr 2023 00:03:03 -0700
Subject: [PATCH 21/34] py : new conversion script (#545)

Current status: Working, except for the latest GPTQ-for-LLaMa format
  that includes `g_idx`.  This turns out to require changes to GGML, so
  for now it only works if you use the `--outtype` option to dequantize it
  back to f16 (which is pointless except for debugging).

  I also included some cleanup for the C++ code.

  This script is meant to replace all the existing conversion scripts
  (including the ones that convert from older GGML formats), while also
  adding support for some new formats.  Specifically, I've tested with:

  - [x] `LLaMA` (original)
  - [x] `llama-65b-4bit`
  - [x] `alpaca-native`
  - [x] `alpaca-native-4bit`
  - [x] LLaMA converted to 'transformers' format using
        `convert_llama_weights_to_hf.py`
  - [x] `alpaca-native` quantized with `--true-sequential --act-order
        --groupsize 128` (dequantized only)
  - [x] same as above plus `--save_safetensors`
  - [x] GPT4All
  - [x] stock unversioned ggml
  - [x] ggmf

  There's enough overlap in the logic needed to handle these different
  cases that it seemed best to move to a single script.

  I haven't tried this with Alpaca-LoRA because I don't know where to find
  it.

  Useful features:

  - Uses multiple threads for a speedup in some cases (though the Python
    GIL limits the gain, and sometimes it's disk-bound anyway).

  - Combines split models into a single file (both the intra-tensor split
    of the original and the inter-tensor split of 'transformers' format
    files).  Single files are more convenient to work with and more
    friendly to future changes to use memory mapping on the C++ side.  To
    accomplish this without increasing memory requirements, it has some
    custom loading code which avoids loading whole input files into memory
    at once.

  - Because of the custom loading code, it no longer depends on PyTorch,
    which might make installing dependencies slightly easier or faster...
    although it still depends on NumPy and sentencepiece, so I don't know
    if there's any meaningful difference.  In any case, I also added a
    requirements.txt file to lock the dependency versions in case of any
    future breaking changes.

  - Type annotations checked with mypy.

  - Some attempts to be extra user-friendly:

      - The script tries to be forgiving with arguments, e.g. you can
        specify either the model file itself or the directory containing
        it.

      - The script doesn't depend on config.json / params.json, just in
        case the user downloaded files individually and doesn't have those
        handy.  But you still need tokenizer.model and, for Alpaca,
        added_tokens.json.

      - The script tries to give a helpful error message if
        added_tokens.json is missing.
---
 README.md                           |    4 +-
 convert-ggml-to-pth.py              |  299 -------
 convert-gpt4all-to-ggml.py          |  107 ---
 convert-gptq-to-ggml.py             |  172 ----
 convert-pth-to-ggml.py              |  277 +------
 convert-unversioned-ggml-to-ggml.py |  100 ---
 convert.py                          | 1143 +++++++++++++++++++++++++++
 migrate-ggml-2023-03-30-pr613.py    |  311 --------
 requirements.txt                    |    2 +
 9 files changed, 1154 insertions(+), 1261 deletions(-)
 delete mode 100644 convert-ggml-to-pth.py
 delete mode 100644 convert-gpt4all-to-ggml.py
 delete mode 100644 convert-gptq-to-ggml.py
 delete mode 100644 convert-unversioned-ggml-to-ggml.py
 create mode 100644 convert.py
 delete mode 100644 migrate-ggml-2023-03-30-pr613.py
 create mode 100644 requirements.txt

diff --git a/README.md b/README.md
index c88e0de28..78215c9ce 100644
--- a/README.md
+++ b/README.md
@@ -192,10 +192,10 @@ ls ./models
 65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
 
 # install Python dependencies
-python3 -m pip install torch numpy sentencepiece
+python3 -m pip install -r requirements.txt
 
 # convert the 7B model to ggml FP16 format
-python3 convert-pth-to-ggml.py models/7B/ 1
+python3 convert.py models/7B/
 
 # quantize the model to 4-bits (using method 2 = q4_0)
 ./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2
diff --git a/convert-ggml-to-pth.py b/convert-ggml-to-pth.py
deleted file mode 100644
index 25a44237a..000000000
--- a/convert-ggml-to-pth.py
+++ /dev/null
@@ -1,299 +0,0 @@
-# Author: github.com/ductai199x
-import argparse
-import os
-import struct
-
-import numpy as np
-import torch
-from numba import njit
-from tqdm.auto import tqdm
-
-
-def read_header(fin):
-    values = struct.unpack("i" * 9, fin.read(4 * 9))
-    _, _, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype = values
-    return {
-        "vocab_size": vocab_size,
-        "dim": dim,
-        "multiple_of": multiple_of,
-        "n_heads": n_heads,
-        "n_layers": n_layers,
-    }, ftype
-
-
-def read_tokens(fin, vocab_size):
-    tokens = []
-    for _ in range(vocab_size):
-        text_len = struct.unpack("i", fin.read(4))[0]
-        text_bytes = fin.read(text_len)
-        try:
-            text = text_bytes.decode()
-        except UnicodeDecodeError:
-            text = text_bytes.decode(errors="replace")
-        score = struct.unpack("f", fin.read(4))[0]
-        tokens.append((text, score))
-    return tokens
-
-
-@njit
-def dequantize_weights_numba(fin_data, n_rows, n_cols):
-    qk = 32
-    nb = n_cols // qk
-    bs = 4 + (qk // 2)
-
-    weights = np.zeros((n_rows, n_cols), dtype=np.float32)
-    data_pos = 0
-
-    for row in range(n_rows):
-        for block in range(nb):
-            d = np.frombuffer(fin_data[data_pos : data_pos + 4], dtype=np.float32)[0]
-            data_pos += 4
-            packed_values = fin_data[data_pos : data_pos + (qk // 2)]
-            data_pos += qk // 2
-
-            for i in range(qk // 2):
-                packed_value = packed_values[i]
-                v0 = np.float32((packed_value & 0b00001111) - 8) * d
-                v1 = np.float32((packed_value >> 4) - 8) * d
-
-                weights[row, block * qk + 2 * i] = v0
-                weights[row, block * qk + 2 * i + 1] = v1
-
-    return weights
-
-
-def dequantize_weights(fin, n_rows, n_cols):
-    qk = 32
-    nb = n_cols // qk
-    data_size = n_rows * n_cols // 2 + n_rows * nb * 4
-    fin_data = fin.read(data_size)
-    return dequantize_weights_numba(fin_data, n_rows, n_cols)
-
-
-def read_variables(fin):
-    model = {}
-    pbar = tqdm(total=os.path.getsize(fin.name), unit="B", unit_scale=True, desc="Reading variables")
-    while True:
-        start_pos = fin.tell()
-        try:
-            n_dims, name_length, ftype_cur = struct.unpack("iii", fin.read(4 * 3))
-        except struct.error:
-            break
-
-        shape = tuple(struct.unpack("i" * n_dims, fin.read(4 * n_dims)))
-        shape = shape[::-1]
-        name = fin.read(name_length).decode()
-
-        # ensure tensor data is aligned
-        tensor_data_offset = fin.tell()
-        tensor_data_offset = (tensor_data_offset + 31) & -32
-        fin.seek(tensor_data_offset)
-
-        if ftype_cur == 2:
-            # 4-bit quantized weights
-            dtype = np.uint8
-            data = dequantize_weights(fin, shape[0], shape[1])
-            data = data.reshape(shape)
-        elif ftype_cur == 0:
-            dtype = np.float32
-            data_size = np.prod(shape)
-            data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
-        elif ftype_cur == 1:
-            dtype = np.float16
-            data_size = np.prod(shape)
-            data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
-
-        model[name] = torch.tensor(data, dtype=torch.float32 if dtype == np.float32 else torch.float16)
-
-        pbar.update(fin.tell() - start_pos)
-
-    return model
-
-
-def convert_to_hf_format(model, hparams):
-    # This works for llama 7B, need to test with other models
-    n_layers = hparams["n_layers"]
-    n_heads = hparams["n_heads"]
-    dim = hparams["dim"]
-    dims_per_head = dim // n_heads
-    base = 10000.0
-    inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
-
-    # permute for sliced rotary
-    def permute(w):
-        return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
-
-    state_dict = {}
-    for layer_i in range(n_layers):
-        state_dict.update(
-            {
-                f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
-                    model[f"layers.{layer_i}.attention.wq.weight"]
-                ),
-                f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
-                    model[f"layers.{layer_i}.attention.wk.weight"]
-                ),
-                f"model.layers.{layer_i}.self_attn.v_proj.weight": model[
-                    f"layers.{layer_i}.attention.wv.weight"
-                ],
-                f"model.layers.{layer_i}.self_attn.o_proj.weight": model[
-                    f"layers.{layer_i}.attention.wo.weight"
-                ],
-                f"model.layers.{layer_i}.mlp.gate_proj.weight": model[
-                    f"layers.{layer_i}.feed_forward.w1.weight"
-                ],
-                f"model.layers.{layer_i}.mlp.down_proj.weight": model[
-                    f"layers.{layer_i}.feed_forward.w2.weight"
-                ],
-                f"model.layers.{layer_i}.mlp.up_proj.weight": model[
-                    f"layers.{layer_i}.feed_forward.w3.weight"
-                ],
-                f"model.layers.{layer_i}.input_layernorm.weight": model[
-                    f"layers.{layer_i}.attention_norm.weight"
-                ],
-                f"model.layers.{layer_i}.post_attention_layernorm.weight": model[
-                    f"layers.{layer_i}.ffn_norm.weight"
-                ],
-            }
-        )
-        state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
-    state_dict.update(
-        {
-            "model.embed_tokens.weight": model["tok_embeddings.weight"],
-            "model.norm.weight": model["norm.weight"],
-            "lm_head.weight": model["output.weight"],
-        }
-    )
-
-    return state_dict
-
-
-def chat(model, hparams, llama_dir):
-    from transformers import (GenerationConfig, LlamaForCausalLM,
-                              LlamaTokenizer, StoppingCriteria,
-                              StoppingCriteriaList)
-    from transformers.models.llama.configuration_llama import LlamaConfig
-
-    class StoppingCriteriaSub(StoppingCriteria):
-        def __init__(self):
-            super().__init__()
-
-        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops=[]):
-            print(tokenizer.decode(input_ids[0]), end="", flush=True)
-            if input_ids[0][-1] == 13:
-                return True
-
-            return False
-
-    config = LlamaConfig(
-        vocab_size=hparams["vocab_size"],
-        dim=hparams["dim"],
-        num_hidden_layers=hparams["n_layers"],
-        num_attention_heads=hparams["n_heads"],
-    )
-
-    llama = LlamaForCausalLM(config=config)
-    llama.load_state_dict(state_dict=model, strict=True)
-    tokenizer = LlamaTokenizer.from_pretrained(llama_dir)
-
-    device = torch.device("cpu")
-    llama = llama.to(device)
-
-    ctx = """You are AI.
-This is a dialog, where User interacts with AI. AI is helpful, kind, obedient, honest, respectful, direct, concise, should try to protect User's privacy, and knows its own limits. Also, AI must answer User and AI cannot stop the conversation by itself.
-User: Hello, AI.
-AI: Hello! How can I assist you today?
-"""
-    print(ctx.rstrip("\n"))
-    while True:
-        print("-" * 60)
-        prompt = input("User: ")
-        if ctx != "":
-            ctx = f"{ctx}User: {prompt}\n"
-        else:
-            ctx = f"{prompt}\nAI:"
-
-        ctx = (ctx[-1920:]) if len(ctx) >= 2048 else ctx
-
-        print("-" * 60)
-        if len(ctx.strip()) > 0:
-            input_ids = tokenizer(ctx, return_tensors="pt")["input_ids"].to(device)
-            generation_config = GenerationConfig(
-                temperature=0.8,
-                top_p=0.95,
-                top_k=50,
-                repetition_penalty=1.1764,
-            )
-            with torch.no_grad():
-                generation_output = llama.generate(
-                    input_ids=input_ids,
-                    generation_config=generation_config,
-                    return_dict_in_generate=True,
-                    output_scores=True,
-                    max_length=2048,
-                    do_sample=True,
-                    stopping_criteria=StoppingCriteriaList([StoppingCriteriaSub()]),
-                )
-            s = generation_output.sequences[0]
-            decoded = tokenizer.decode(s)
-            ctx = f"{decoded}\n"
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--input_dir", "-i", type=str, required=True, help="The input directory containing the ggml files."
-    )
-    parser.add_argument(
-        "--prefix",
-        "-p",
-        type=str,
-        required=True,
-        help="The prefix of the ggml files (ggml-model-f16 or ggml-model-q4_0).",
-    )
-    parser.add_argument(
-        "--hf",
-        action="store_true",
-        help="Whether to save the model in the Hugging Face format. (default: False)",
-    )
-    parser.add_argument(
-        "--chat", "-c", action="store_true", help="Whether to open a chat with the model. (default: False)"
-    )
-    args = parser.parse_args()
-
-    llama_dir = os.path.abspath(f"{args.input_dir}/../")
-
-    ggml_files = sorted(
-        [f"{args.input_dir}/{f}" for f in os.listdir(args.input_dir) if f.startswith(args.prefix)]
-    )
-
-    fin = open(ggml_files[0], "rb")
-    hparams, ftype = read_header(fin)
-    tokens = read_tokens(fin, hparams["vocab_size"])
-    model = read_variables(fin)
-
-    for f in tqdm(ggml_files[1:]):
-        fin = open(f, "rb")
-        read_header(fin)
-        read_tokens(fin, hparams["vocab_size"])
-        model.update(read_variables(fin))
-
-    if args.hf:
-        model = convert_to_hf_format(model, hparams)
-
-    pth_ckpt = {
-        "state_dict": model,
-        "hparams": hparams,
-        "tokens": tokens,
-    }
-
-    torch.save(pth_ckpt, f"{args.input_dir}/{args.prefix}-to-torch.pth")
-
-    if args.chat:
-        if not args.hf:
-            model = convert_to_hf_format(model, hparams)
-        chat(model, hparams, llama_dir)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/convert-gpt4all-to-ggml.py b/convert-gpt4all-to-ggml.py
deleted file mode 100644
index b1a5e0560..000000000
--- a/convert-gpt4all-to-ggml.py
+++ /dev/null
@@ -1,107 +0,0 @@
-#!/usr/bin/env python3
-
-#
-# TODO: deduplicate GPT4All with convert-unversioned-ggml-to-ggml.py
-#
-
-# Original by https://github.com/eiz
-# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
-import argparse
-import glob
-import os
-import struct
-import sys
-from sentencepiece import SentencePieceProcessor
-
-HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
-    parser.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin')
-    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
-    return parser.parse_args()
-
-def read_header(f_in):
-    struct_fmt = "i" * (3 + len(HPARAMS))
-    struct_size = struct.calcsize(struct_fmt)
-    buf = f_in.read(struct_size)
-    return struct.unpack(struct_fmt, buf)
-
-def write_header(f_out, header):
-    (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
-
-    if magic != 0x67676d6c:
-        raise Exception('Invalid file magic. Must be an old style ggml file.')
-
-    values = [
-        0x67676d66, # magic: ggml in hex
-        1,          # file version
-        vocab_size,
-        dim,
-        multiple_of,
-        n_heads,
-        n_layers,
-        rot,
-        ftype
-    ]
-    f_out.write(struct.pack("i" * len(values), *values))
-
-def write_tokens(fout, tokenizer):
-    for i in range(tokenizer.vocab_size()):
-        if tokenizer.is_unknown(i):
-            text = " \u2047 ".encode()
-        elif tokenizer.is_control(i):
-            text = b""
-        elif tokenizer.is_byte(i):
-            piece = tokenizer.id_to_piece(i)
-            if len(piece) != 6:
-                print(f"Invalid token: {piece}")
-                sys.exit(1)
-            byte_value = int(piece[3:-1], 16)
-            text = struct.pack("B", byte_value)
-        else:
-            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
-        fout.write(struct.pack("i", len(text)))
-        fout.write(text)
-        fout.write(struct.pack("f", tokenizer.get_score(i)))
-
-    # TODO: GPT4All - add extra <pad> token
-    text = "<pad>".encode()
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-    fout.write(struct.pack("f", 0.0))
-
-def read_tokens(f_in, tokenizer):
-    for i in range(tokenizer.vocab_size()):
-        len_b = f_in.read(4)
-        (length,) = struct.unpack("i", len_b)
-        f_in.read(length)
-
-def copy_all_data(f_out, f_in):
-    while True:
-        buf = f_in.read(1024 * 1024)
-        if not buf:
-            break
-        f_out.write(buf)
-
-def convert_one_file(path_in, tokenizer):
-    path_tmp = f"{path_in}.tmp"
-    path_orig= f"{path_in}.orig"
-    print(f"converting {path_in}")
-    with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
-        write_header(f_out, read_header(f_in))
-        read_tokens(f_in, tokenizer)
-        write_tokens(f_out, tokenizer)
-        copy_all_data(f_out, f_in)
-    os.rename(path_in, path_orig)
-    os.rename(path_tmp, path_in)
-
-def main():
-    args = parse_args()
-
-    tokenizer = SentencePieceProcessor(args.tokenizer_model)
-
-    convert_one_file(args.gpt4all_model, tokenizer)
-
-if __name__ == "__main__":
-    main()
diff --git a/convert-gptq-to-ggml.py b/convert-gptq-to-ggml.py
deleted file mode 100644
index 42e99c2ff..000000000
--- a/convert-gptq-to-ggml.py
+++ /dev/null
@@ -1,172 +0,0 @@
-# Convert a GPTQ quantized LLaMA model to a ggml compatible file
-# Based on: https://github.com/qwopqwop200/GPTQ-for-LLaMa
-#
-import os
-import re
-import sys
-import json
-import struct
-import numpy as np
-import torch
-from sentencepiece import SentencePieceProcessor
-
-if len(sys.argv) != 4:
-    print("Usage: convert-gptq-to-ggml.py llamaXXb-4bit.pt tokenizer.model out.bin\n")
-    sys.exit(1)
-
-fname_model = sys.argv[1]
-fname_tokenizer = sys.argv[2]
-dir_out = sys.argv[3]
-
-model = torch.load(fname_model, map_location="cpu")
-
-n_vocab, n_embd = model['model.embed_tokens.weight'].shape
-n_layer = 1 + max(int(m.group(1)) for name in model
-                  if (m := re.match(r'model\.layers\.([0-9]+)', name)))
-
-# hardcoded:
-n_mult = 256
-n_head = {32: 32, 40: 40, 60: 52, 80: 64}[n_layer]
-
-tokenizer = SentencePieceProcessor(fname_tokenizer)
-
-assert tokenizer.vocab_size() == n_vocab
-
-fname_out = sys.argv[3]
-
-fout = open(fname_out, "wb")
-
-fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex
-fout.write(struct.pack("i", 1)) # file version
-fout.write(struct.pack("i", n_vocab))
-fout.write(struct.pack("i", n_embd))
-fout.write(struct.pack("i", n_mult))
-fout.write(struct.pack("i", n_head))
-fout.write(struct.pack("i", n_layer))
-fout.write(struct.pack("i", n_embd // n_head)) # rot (obsolete)
-fout.write(struct.pack("i", 4))
-
-
-# This loop unchanged from convert-pth-to-ggml.py:
-for i in range(tokenizer.vocab_size()):
-    if tokenizer.is_unknown(i):
-        text = " \u2047 ".encode()
-    elif tokenizer.is_control(i):
-        text = b""
-    elif tokenizer.is_byte(i):
-        piece = tokenizer.id_to_piece(i)
-        if len(piece) != 6:
-            print(f"Invalid token: {piece}")
-            sys.exit(1)
-        byte_value = int(piece[3:-1], 16)
-        text = struct.pack("B", byte_value)
-    else:
-        text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-    fout.write(struct.pack("f", tokenizer.get_score(i)))
-
-def write_header(shape, dst_name, ftype_cur):
-    sname = dst_name.encode()
-    fout.write(struct.pack("iii", len(shape), len(sname), ftype_cur))
-    fout.write(struct.pack("i" * len(shape), *shape[::-1]))
-    fout.write(sname)
-
-    # ensure tensor data is aligned
-    tensor_data_offset = fout.tell()
-    tensor_data_offset = (tensor_data_offset + 31) & -32
-    fout.seek(tensor_data_offset)
-
-def convert_non_q4(src_name, dst_name):
-    v = model[src_name]
-    shape = v.shape
-    print(f"Processing non-Q4 variable: {src_name} with shape: {shape} and type: {v.dtype}")
-    if len(shape) == 1:
-        print("  Converting to float32")
-        v = v.to(torch.float32)
-
-    ftype_cur = {torch.float16: 1, torch.float32: 0}[v.dtype]
-
-    # header
-    write_header(shape, dst_name, ftype_cur)
-
-    # data
-    v.numpy().tofile(fout)
-
-def convert_q4(src_name, dst_name, permute=False):
-    zeros = model[f"{src_name}.zeros"].numpy()
-    scales = model[f"{src_name}.scales"].numpy()
-    bias = model[f"{src_name}.bias"].numpy()
-    qweight = model[f"{src_name}.qweight"].numpy().T # transpose
-
-    # Q4_1 does not support bias; good thing the bias is always all zeros.
-    assert not np.any(bias)
-
-    # Each int32 item is actually 8 int4 items packed together, and it's transposed.
-    shape = (qweight.shape[0], qweight.shape[1] * 8)
-
-    print(f"Processing Q4 variable: {src_name} with shape: {shape}")
-
-    # The output format has the int4 weights in groups of 32 rather than 8.
-    # It looks like this:
-    # For each row:
-    #   For each group of 32 columns:
-    #     - addend (float32, 4 bytes)
-    #     - scale (float32, 4 bytes)
-    #     - weights (int4 * 32, 16 bytes)
-    # Note that in the input, the scales and addends are shared between all
-    # the columns in a row, so we end up wasting quite a bit of memory with
-    # repeated scales and addends.
-
-    addends = -zeros # flip sign
-
-    # Since the output format is mixed between integers and floats, we have
-    # to hackily view the floats as int32s just so numpy will let us
-    # concatenate them.
-    addends_view = addends.view(dtype=np.int32)
-    scales_view = scales.view(dtype=np.int32)
-
-    # Split into groups of 4 columns (i.e. 32 columns of quantized data):
-    grouped = qweight.reshape([qweight.shape[0], qweight.shape[1] // 4, 4])
-
-    # Repeat addends and scales:
-    addends_rep = np.atleast_3d(addends_view).repeat(grouped.shape[1], axis=1)
-    scales_rep = np.atleast_3d(scales_view).repeat(grouped.shape[1], axis=1)
-
-    blob = np.concatenate([scales_rep, addends_rep, grouped], axis=2, casting='no')
-
-    if permute:
-        # Permute some rows to undo the permutation done by convert_llama_weights_to_hf.py.
-        # This can be done after the above conversion because it doesn't affect column order/layout.
-        blob = (blob.reshape(n_head, 2, shape[0] // n_head // 2, *blob.shape[1:])
-                    .swapaxes(1, 2)
-                    .reshape(blob.shape))
-
-    # header
-    write_header(shape, dst_name, 3) # ftype = Q4_1
-
-    # data
-    blob.tofile(fout)
-
-convert_non_q4("model.embed_tokens.weight", "tok_embeddings.weight")
-convert_non_q4("model.norm.weight", "norm.weight")
-convert_non_q4("lm_head.weight", "output.weight")
-
-for i in range(n_layer):
-    convert_q4(f"model.layers.{i}.self_attn.q_proj", f"layers.{i}.attention.wq.weight", permute=True)
-    convert_q4(f"model.layers.{i}.self_attn.k_proj", f"layers.{i}.attention.wk.weight", permute=True)
-    convert_q4(f"model.layers.{i}.self_attn.v_proj", f"layers.{i}.attention.wv.weight")
-    convert_q4(f"model.layers.{i}.self_attn.o_proj", f"layers.{i}.attention.wo.weight")
-
-    convert_q4(f"model.layers.{i}.mlp.gate_proj", f"layers.{i}.feed_forward.w1.weight")
-    convert_q4(f"model.layers.{i}.mlp.down_proj", f"layers.{i}.feed_forward.w2.weight")
-    convert_q4(f"model.layers.{i}.mlp.up_proj",   f"layers.{i}.feed_forward.w3.weight")
-
-    convert_non_q4(f"model.layers.{i}.input_layernorm.weight", f"layers.{i}.attention_norm.weight")
-    convert_non_q4(f"model.layers.{i}.post_attention_layernorm.weight", f"layers.{i}.ffn_norm.weight")
-
-
-fout.close()
-
-print(f"Done. Output file: {fname_out}")
-print()
diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py
index dcef2f6a3..f87ac270c 100644
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -1,274 +1,11 @@
-# Convert a LLaMA model checkpoint to a ggjt compatible file
-#
-# Load the model using Torch
-# Iterate over all variables and write them to a binary file.
-#
-# For each variable, write the following:
-#   - Number of dimensions (int)
-#   - Name length (int)
-#   - Dimensions (int[n_dims])
-#   - Name (char[name_length])
-#   - Data (float[n_dims])
-#
-# At the start of the ggml file we write the model parameters
-# and vocabulary.
-#
+# Compatibility stub
 
 import argparse
-import os
-import sys
-import json
-import struct
-import numpy as np
-import torch
 
-from sentencepiece import SentencePieceProcessor
+import convert
 
-QK = 32
-
-GGML_TYPE_Q4_0  = 0
-GGML_TYPE_Q4_1  = 1
-GGML_TYPE_I8    = 2
-GGML_TYPE_I16   = 3
-GGML_TYPE_I32   = 4
-GGML_TYPE_F16   = 5
-GGML_TYPE_F32   = 6
-
-WTYPES = {
-    0: GGML_TYPE_F32,
-    1: GGML_TYPE_F16,
-    2: GGML_TYPE_Q4_0,
-    3: GGML_TYPE_Q4_1,
-}
-
-GGML_BLCK_SIZE = {
-    GGML_TYPE_Q4_0:  QK,
-    GGML_TYPE_Q4_1:  QK,
-    GGML_TYPE_I8:    1,
-    GGML_TYPE_I16:   1,
-    GGML_TYPE_I32:   1,
-    GGML_TYPE_F16:   1,
-    GGML_TYPE_F32:   1,
-}
-
-GGML_TYPE_SIZE = {
-    GGML_TYPE_Q4_0: 4   + QK//2,
-    GGML_TYPE_Q4_1: 4*2 + QK//2,
-    GGML_TYPE_I8:   1,
-    GGML_TYPE_I16:  2,
-    GGML_TYPE_I32:  4,
-    GGML_TYPE_F16:  2,
-    GGML_TYPE_F32:  4,
-}
-
-def ggml_nelements(shape):
-    r = 1
-    for i in shape:
-        r *= i
-    return r
-
-def ggml_nbytes(shape, ftype):
-    x = ggml_nelements(shape)
-    t = WTYPES[ftype]
-    x *= GGML_TYPE_SIZE[t]
-    x //= GGML_BLCK_SIZE[t]
-    return x
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
-    parser.add_argument('dir_model',  help='directory containing the model checkpoint')
-    parser.add_argument('ftype',      help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
-    parser.add_argument('vocab_only', help='only write vocab to file', type=int, default=0, nargs='?')
-    return parser.parse_args()
-
-def get_n_parts(dim):
-    mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
-    n_parts = mappings.get(dim)
-    if n_parts is None:
-        print(f"Invalid dim: {dim}")
-        sys.exit(1)
-
-    print(f"n_parts = {n_parts}\n")
-    return n_parts
-
-def load_hparams_and_tokenizer(dir_model):
-    # `dir_model` is something like `models/7B` or `models/7B/`.
-    # "tokenizer.model" is expected under model's parent dir.
-    # When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found.
-    # Let's use the model's parent dir directly.
-    model_parent_dir = os.path.dirname(os.path.normpath(dir_model))
-    fname_hparams = f"{dir_model}/params.json"
-    fname_tokenizer = f"{model_parent_dir}/tokenizer.model"
-    with open(fname_hparams, "r") as f:
-        hparams = json.load(f)
-        print(hparams)
-    tokenizer = SentencePieceProcessor(fname_tokenizer)
-    hparams.update({"vocab_size": tokenizer.vocab_size()})
-    return hparams, tokenizer
-
-def write_header(fout, hparams, ftype):
-    keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
-    values = [
-        0x67676a74,  # magic: ggjt in hex
-        1, # file version
-        *[hparams[key] for key in keys],
-        hparams["dim"] // hparams["n_heads"],  # rot (obsolete)
-        ftype
-    ]
-    fout.write(struct.pack("i" * len(values), *values))
-
-def write_tokens(fout, tokenizer):
-    for i in range(tokenizer.vocab_size()):
-        if tokenizer.is_unknown(i):
-            text = " \u2047 ".encode()
-        elif tokenizer.is_control(i):
-            text = b""
-        elif tokenizer.is_byte(i):
-            piece = tokenizer.id_to_piece(i)
-            if len(piece) != 6:
-                print(f"Invalid token: {piece}")
-                sys.exit(1)
-            byte_value = int(piece[3:-1], 16)
-            text = struct.pack("B", byte_value)
-        else:
-            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
-        fout.write(struct.pack("i", len(text)))
-        fout.write(text)
-        fout.write(struct.pack("f", tokenizer.get_score(i)))
-
-def process_and_write_variables(fout, model, ftype, part_id, n_parts):
-    for name, datao in model.items():
-        if name.endswith("freqs"):
-            continue
-
-        # remove dimensions with a single element
-        data = datao.numpy().squeeze()
-        partshape = data.shape
-        n_dims = len(data.shape)
-        assert n_dims in (1, 2)
-
-        print(f"Processing variable: {name} with shape: {partshape} and type: {datao.dtype}")
-
-        # coerce single-dimensional tensors from float16 to float32
-        ftype_cur = 1
-        if ftype == 0 or n_dims == 1:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype_cur = 0
-        blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]]
-        type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]]
-
-        # determine dimension along which multipart tensor is sharded
-        #
-        # split_dim 0 regex:
-        #   - output.*
-        #   - layers.*.attention.wq.weight
-        #   - layers.*.attention.wk.weight
-        #   - layers.*.attention.wv.weight
-        #   - layers.*.feed_forward.w1.weight
-        #   - layers.*.feed_forward.w3.weight
-        #
-        # split_dim 1 regex:
-        #   - tok_embeddings.*
-        #   - layers.*.attention.wo.weight
-        #   - layers.*.feed_forward.w2.weight
-        #
-        if n_dims > 1:
-            split_dim = 1
-            if "tok_embeddings" in name:
-                split_dim = 1
-            elif "layers" in name:
-                if "attention.wo.weight" in name:
-                    split_dim = 1
-                elif "feed_forward.w2.weight" in name:
-                    split_dim = 1
-                else:
-                    split_dim = 0
-            elif "output" in name:
-                split_dim = 0
-
-        # output tensor header
-        fullshape = list(partshape)
-        if n_dims > 1:
-            fullshape[split_dim] *= n_parts
-        sname = name.encode()
-        fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
-        for dim in reversed(fullshape):
-            fout.write(struct.pack("i", dim))
-        fout.write(sname)
-
-        # ensure tensor data is aligned
-        tensor_data_offset = fout.tell()
-        while tensor_data_offset % QK != 0:
-            fout.write(struct.pack("B", 0))
-            tensor_data_offset += 1
-
-        # output unified mappable tensor data
-        if n_dims == 1 or n_parts == 1:
-            # copy tensor which we thankfully received in one piece
-            if part_id == 0:
-                data.tofile(fout)
-        elif split_dim == 0:
-            # reassemble multifile tensor containing some of the rows
-            rows_per_chunk = partshape[0]
-            current_row = part_id * rows_per_chunk
-            bytes_per_row = fullshape[1] // blck_size * type_size
-            offset = current_row * bytes_per_row
-            fout.seek(tensor_data_offset + offset)
-            data.tofile(fout)
-        elif split_dim == 1:
-            # reassemble multifile tensor containing some of the cols
-            cols_per_chunk = partshape[1]
-            current_col = part_id * cols_per_chunk
-            bytes_per_row = fullshape[1] // blck_size * type_size
-            offset_current_col = current_col // blck_size * type_size
-            for row in range(partshape[0]):
-                offset_row = row * bytes_per_row
-                offset = offset_row + offset_current_col
-                fout.seek(tensor_data_offset + offset)
-                data[row].tofile(fout)
-
-        # advance file position to next tensor
-        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype_cur))
-
-def main():
-    args = parse_args()
-    dir_model = args.dir_model
-    ftype = args.ftype
-    ftype_str = ["f32", "f16"]
-    hparams, tokenizer = load_hparams_and_tokenizer(dir_model)
-
-    print(args)
-
-    # if only writing vocab to file
-    if args.vocab_only:
-        fname_model = f"{dir_model}/consolidated.00.pth"
-        fname_out = f"{dir_model}/ggml-vocab.bin"
-        print(f"Extracting only the vocab from '{fname_model}'\n")
-        with open(fname_out, "wb") as fout:
-            write_header(fout, hparams, ftype)
-            write_tokens(fout, tokenizer)
-        print(f"Done. Output file: {fname_out}\n")
-        return
-
-    n_parts = get_n_parts(hparams["dim"])
-    fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin"
-
-    # we output a single file for ggml
-    with open(fname_out, "wb") as fout:
-        write_header(fout, hparams, ftype)
-        write_tokens(fout, tokenizer)
-        offset_of_tensors = fout.tell()
-        # the tensors we load could be split across multiple files
-        for part_id in range(n_parts):
-            fout.seek(offset_of_tensors)
-            print(f"Processing part {part_id+1} of {n_parts}\n")
-            fname_model = f"{dir_model}/consolidated.0{part_id}.pth"
-            model = torch.load(fname_model, map_location="cpu")
-            process_and_write_variables(fout, model, ftype, part_id, n_parts)
-            del model
-
-    print(f"Done. Output file: {fname_out}\n")
-
-if __name__ == "__main__":
-    main()
+parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
+parser.add_argument('dir_model',  help='directory containing the model checkpoint')
+parser.add_argument('ftype',      help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1, nargs='?')  # nargs='?' makes default=1 effective
+args = parser.parse_args()
+convert.main(['--outtype', 'f16' if args.ftype == 1 else 'f32', '--', args.dir_model])
diff --git a/convert-unversioned-ggml-to-ggml.py b/convert-unversioned-ggml-to-ggml.py
deleted file mode 100644
index 5151d9081..000000000
--- a/convert-unversioned-ggml-to-ggml.py
+++ /dev/null
@@ -1,100 +0,0 @@
-#!/usr/bin/env python3
-# Original by https://github.com/eiz
-# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
-import argparse
-import glob
-import os
-import struct
-import sys
-from sentencepiece import SentencePieceProcessor
-
-HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Upgrade old ggml model files to the current format')
-    parser.add_argument('dir_model', help='directory containing ggml .bin files')
-    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
-    return parser.parse_args()
-
-def read_header(f_in):
-    struct_fmt = "i" * (3 + len(HPARAMS))
-    struct_size = struct.calcsize(struct_fmt)
-    buf = f_in.read(struct_size)
-    return struct.unpack(struct_fmt, buf)
-
-def write_header(f_out, header):
-    (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
-
-    if magic != 0x67676d6c:
-        raise Exception('Invalid file magic. Must be an old style ggml file.')
-
-    values = [
-        0x67676d66,  # magic: ggml in hex
-        1, # file version
-        vocab_size,
-        dim,
-        multiple_of,
-        n_heads,
-        n_layers,
-        rot,
-        ftype
-    ]
-    f_out.write(struct.pack("i" * len(values), *values))
-
-def write_tokens(fout, tokenizer):
-    for i in range(tokenizer.vocab_size()):
-        if tokenizer.is_unknown(i):
-            text = " \u2047 ".encode()
-        elif tokenizer.is_control(i):
-            text = b""
-        elif tokenizer.is_byte(i):
-            piece = tokenizer.id_to_piece(i)
-            if len(piece) != 6:
-                print(f"Invalid token: {piece}")
-                sys.exit(1)
-            byte_value = int(piece[3:-1], 16)
-            text = struct.pack("B", byte_value)
-        else:
-            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
-        fout.write(struct.pack("i", len(text)))
-        fout.write(text)
-        fout.write(struct.pack("f", tokenizer.get_score(i)))
-
-def read_tokens(f_in, tokenizer):
-    for i in range(tokenizer.vocab_size()):
-        len_b = f_in.read(4)
-        (length,) = struct.unpack("i", len_b)
-        f_in.read(length)
-
-def copy_all_data(f_out, f_in):
-    while True:
-        buf = f_in.read(1024 * 1024)
-        if not buf:
-            break
-        f_out.write(buf)
-
-def convert_one_file(path_in, tokenizer):
-    path_tmp = f"{path_in}.tmp"
-    path_orig= f"{path_in}.orig"
-    print(f"converting {path_in}")
-    with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
-        write_header(f_out, read_header(f_in))
-        read_tokens(f_in, tokenizer)
-        write_tokens(f_out, tokenizer)
-        copy_all_data(f_out, f_in)
-    os.rename(path_in, path_orig)
-    os.rename(path_tmp, path_in)
-
-def main():
-    args = parse_args()
-    files = []
-    files.extend(glob.glob(f"{args.dir_model}/*.bin"))
-    files.extend(glob.glob(f"{args.dir_model}/*.bin.*"))
-
-    tokenizer = SentencePieceProcessor(args.tokenizer_model)
-
-    for file in files:
-        convert_one_file(file, tokenizer)
-
-if __name__ == "__main__":
-    main()
diff --git a/convert.py b/convert.py
new file mode 100644
index 000000000..f35163f67
--- /dev/null
+++ b/convert.py
@@ -0,0 +1,1143 @@
+import argparse
+import concurrent.futures
+import copy
+import enum
+import faulthandler
+import functools
+import io
+import itertools
+import json
+import math
+import mmap
+import pickle
+import re
+import signal
+import struct
+import sys
+import zipfile
+from abc import ABCMeta, abstractmethod
+from dataclasses import dataclass
+from pathlib import Path
+import numpy as np
+from sentencepiece import SentencePieceProcessor  # type: ignore
+from typing import (IO, Any, Callable, Iterable, Literal, Optional, Sequence,
+                    TypeVar, Union, List, Dict, Tuple, TYPE_CHECKING)
+if TYPE_CHECKING:
+    from typing_extensions import TypeAlias
+
+if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
+    faulthandler.register(signal.SIGUSR1)
+
+NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
+
+
+@dataclass(frozen=True)
+class UnquantizedDataType:
+    name: str
+
+
+DT_F16 = UnquantizedDataType('F16')
+DT_F32 = UnquantizedDataType('F32')
+DT_I32 = UnquantizedDataType('I32')
+DT_BF16 = UnquantizedDataType('BF16')
+
+
+@dataclass(frozen=True)
+class QuantizedDataType:
+    groupsize: int
+    have_addends: bool
+    have_g_idx: bool
+
+
+DT_Q4_0 = QuantizedDataType(groupsize=32, have_addends=False, have_g_idx=False)
+DT_Q4_1 = QuantizedDataType(groupsize=32, have_addends=True, have_g_idx=False)
+
+DataType = Union[UnquantizedDataType, QuantizedDataType]
+
+DATA_TYPE_TO_FTYPE: Dict[DataType, int] = {
+    DT_F32: 0,
+    DT_F16: 1,
+    DT_Q4_0: 2,
+    DT_Q4_1: 3,
+}
+
+FTYPE_TO_DATA_TYPE: Dict[int, DataType] = \
+    {ftype: dtype for (dtype, ftype) in DATA_TYPE_TO_FTYPE.items()}
+
+DATA_TYPE_TO_NUMPY: Dict[DataType, 'np.dtype[Any]'] = {
+    DT_F16: np.dtype(np.float16),
+    DT_F32: np.dtype(np.float32),
+    DT_I32: np.dtype(np.int32),
+}
+
+NUMPY_TYPE_TO_DATA_TYPE: Dict['np.dtype[Any]', DataType] = \
+    {dtype: data_type for (data_type, dtype) in DATA_TYPE_TO_NUMPY.items()}
+
+
+class GGMLFileType(enum.Enum):
+    AllF32 = 0
+    MostlyF16 = 1  # except 1d tensors
+    MostlyQ4_0 = 2  # except 1d tensors
+    MostlyQ4_1 = 3  # except 1d tensors
+    PerLayerIsQ4_1 = 4  # but tok_embeddings.weight and output.weight are F16
+
+    def type_for_tensor(self, name: str, tensor: 'LazyTensor') -> DataType:
+        if len(tensor.shape) == 1:
+            # 1D tensors are always F32.
+            return DT_F32
+        elif self == GGMLFileType.AllF32:
+            return DT_F32
+        elif self == GGMLFileType.MostlyF16:
+            return DT_F16
+        elif self == GGMLFileType.MostlyQ4_0:
+            return DT_Q4_0
+        elif self == GGMLFileType.MostlyQ4_1:
+            return DT_Q4_1
+        elif self == GGMLFileType.PerLayerIsQ4_1:
+            if name in ('output.weight', 'tok_embeddings.weight'):
+                return DT_F16
+            else:
+                return DT_Q4_1
+        else:
+            raise ValueError(self)
+
+
+def make_tensors_list() -> List[str]:
+    '''Return the recognized tensor names: 3 global ones plus 9 per layer for layers 0-79.'''
+    ret = [
+        'tok_embeddings.weight',
+        'norm.weight',
+        'output.weight',
+    ]
+    for i in range(80):  # maximum number of layers
+        ret += [
+            f'layers.{i}.attention.wq.weight',
+            f'layers.{i}.attention.wk.weight',
+            f'layers.{i}.attention.wv.weight',
+            f'layers.{i}.attention.wo.weight',
+            f'layers.{i}.attention_norm.weight',
+            f'layers.{i}.feed_forward.w1.weight',
+            f'layers.{i}.feed_forward.w2.weight',
+            f'layers.{i}.feed_forward.w3.weight',
+            f'layers.{i}.ffn_norm.weight',
+        ]
+    return ret
+
+
+TENSORS_LIST = make_tensors_list()
+TENSORS_SET = set(TENSORS_LIST)
+
+
+@dataclass
+class Params:
+    n_vocab: int
+    n_embd: int
+    n_mult: int
+    n_head: int
+    n_layer: int
+    file_type: GGMLFileType
+
+    @staticmethod
+    def guessed(model: 'LazyModel', file_type: GGMLFileType) -> 'Params':
+        n_vocab, n_embd = model["tok_embeddings.weight"].shape
+
+        return Params(
+            n_vocab=n_vocab,
+            n_embd=n_embd,
+            n_mult=256,
+            n_head=n_embd // 128,
+            n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model),
+            file_type=file_type,
+        )
+
+
+class SentencePieceVocab:
+    '''Vocabulary backed by a SentencePiece model plus optional JSON-defined added tokens.'''
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
+        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+        added_tokens: Dict[str, int] = {}
+        if fname_added_tokens is not None:
+            with open(fname_added_tokens, encoding="utf-8") as f:  # JSON is UTF-8; close the handle promptly
+                added_tokens = json.load(f)
+        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids = sorted(added_tokens.values())
+        if expected_ids != actual_ids:
+            raise Exception(f"Expected added token IDs to be sequential and start at {vocab_size}; got {actual_ids}")
+        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        self.added_tokens_list = [text for (text, idx) in items]
+        self.vocab_size_base: int = vocab_size
+        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer = fname_tokenizer
+        self.fname_added_tokens = fname_added_tokens
+
+    def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        tokenizer = self.sentencepiece_tokenizer
+        for i in range(tokenizer.vocab_size()):
+            text: bytes
+            if tokenizer.is_unknown(i):
+                text = " \u2047 ".encode("utf-8")
+            elif tokenizer.is_control(i):
+                text = b""
+            elif tokenizer.is_byte(i):
+                piece = tokenizer.id_to_piece(i)
+                if len(piece) != 6:
+                    raise Exception(f"Invalid token: {piece}")
+                byte_value = int(piece[3:-1], 16)
+                text = struct.pack("B", byte_value)
+            else:
+                text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+            score: float = tokenizer.get_score(i)
+            yield text, score
+
+    def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        # Added tokens are all emitted with the same fixed score of -1000.0.
+        for text in self.added_tokens_list:
+            yield text.encode("utf-8"), -1000.0
+
+    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        yield from self.sentencepiece_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class GGMLVocab:
+    def __init__(self, tokens: List[Tuple[bytes, float]]):
+        self.tokens = tokens
+        self.vocab_size = len(tokens)
+
+    def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
+        return self.tokens
+
+    def __repr__(self) -> str:
+        return f"<GGMLVocab with {self.vocab_size} tokens>"
+
+
+Vocab = Union[SentencePieceVocab, GGMLVocab]
+
+
+def permute(weights: NDArray, n_head: int) -> NDArray:
+    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                   .swapaxes(1, 2)
+                   .reshape(weights.shape))
+
+
+def dequantize_q4(qvalues_pack32: NDArray, scales: NDArray, addends: Optional[NDArray], g_idx: Optional[NDArray]) -> NDArray:
+    # First reinterpret each row from a list of int32s containing 8 values each
+    # to a list of uint8s containing 2 values each.
+    qvalues_pack8 = qvalues_pack32.view(np.uint8)
+
+    # Then split out the two values per int8 (which requires an actual
+    # conversion because numpy doesn't natively support int4s).
+    qvalues = np.zeros([qvalues_pack8.shape[0], qvalues_pack8.shape[1] * 2], dtype=np.uint8)
+    qvalues[:, 0::2] = qvalues_pack8 & 0xf
+    qvalues[:, 1::2] = qvalues_pack8 >> 4
+
+    assert addends is None or addends.shape == scales.shape
+    assert qvalues.shape[0] == scales.shape[0]
+    assert qvalues.shape[1] % scales.shape[1] == 0
+    if g_idx is None:
+        repeat_count = qvalues.shape[1] // scales.shape[1]
+        scales = scales[:, :, np.newaxis]
+        if addends is not None:
+            addends = addends[:, :, np.newaxis]
+        # Reshape so that the below computation broadcasts over scales and addends:
+        qvalues.shape = (qvalues.shape[0], scales.shape[1], int(repeat_count))
+    else:
+        # In this case the scale and addend is selected for each column by g_idx:
+        assert addends is not None
+        scales = scales[:, g_idx]
+        addends = addends[:, g_idx]
+    if addends is None:
+        # Q4_0
+        qvalues = qvalues.view(np.int8)
+        qvalues -= 8
+    # And do the actual 'value = scale * qvalue + addend' computation.
+    values = scales * qvalues
+    if addends is not None:
+        values += addends
+    if g_idx is None:
+        values.shape = (values.shape[0], values.shape[1] * values.shape[2])
+    return values
+
+
+class Tensor(metaclass=ABCMeta):
+    data_type: DataType
+
+    @abstractmethod
+    def astype(self, data_type: DataType) -> 'Tensor': ...
+    @abstractmethod
+    def permute(self, n_head: int) -> 'Tensor': ...
+    @abstractmethod
+    def to_ggml(self) -> 'GGMLCompatibleTensor': ...
+
+
+class UnquantizedTensor(Tensor):
+    def __init__(self, ndarray: NDArray) -> None:
+        assert isinstance(ndarray, np.ndarray)
+        self.ndarray = ndarray
+        self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
+
+    def astype(self, data_type: DataType) -> Tensor:
+        dtype = DATA_TYPE_TO_NUMPY[data_type]
+        return UnquantizedTensor(self.ndarray.astype(dtype))
+
+    def to_ggml(self) -> 'UnquantizedTensor':
+        return self
+
+    def permute(self, n_head: int) -> 'UnquantizedTensor':
+        return UnquantizedTensor(permute(self.ndarray, n_head))
+
+
+def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray:
+    tensor = lazy_tensor.load()
+    assert isinstance(tensor, UnquantizedTensor)
+
+    # double-check:
+    actual_shape = list(tensor.ndarray.shape)
+    assert actual_shape == lazy_tensor.shape, (actual_shape, lazy_tensor.shape)
+    if expected_dtype is not None and expected_dtype != tensor.ndarray.dtype:
+        if convert:
+            tensor.ndarray = tensor.ndarray.astype(expected_dtype)
+        else:
+            raise ValueError(f'expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}')
+
+    return tensor.ndarray
+
+
+class GGMLQuantizedTensor(Tensor):
+    data_type: QuantizedDataType
+
+    def __init__(self, ndarray: NDArray, shape: List[int], data_type: DataType) -> None:
+        rows, columns = shape
+        assert data_type in (DT_Q4_1, DT_Q4_0)  # for now
+        assert isinstance(data_type, QuantizedDataType)  # redundant, but mypy complains without this
+        assert columns % data_type.groupsize == 0
+        words_in_block = 6 if data_type == DT_Q4_1 else 5
+        self.ndarray = ndarray.view(dtype=np.uint32).reshape((rows, columns // data_type.groupsize, words_in_block))
+        self.shape = shape[:]
+        self.data_type = data_type
+
+    def astype(self, data_type: DataType) -> Tensor:
+        if data_type == self.data_type:
+            return self
+        scales = self.ndarray[:, :, 0].view(np.float32)
+        if self.data_type.have_addends:
+            addends = self.ndarray[:, :, 1].view(np.float32)
+        else:
+            addends = None
+        qweights = self.ndarray[:, :, -4:].reshape([self.shape[0], self.shape[1] // 8])
+
+        dq = dequantize_q4(qweights, scales, addends, g_idx=None)
+        return UnquantizedTensor(dq).astype(data_type)
+
+    def to_ggml(self) -> 'GGMLQuantizedTensor':
+        return self
+
+    def permute(self, n_head: int) -> 'GGMLQuantizedTensor':
+        return GGMLQuantizedTensor(permute(self.ndarray, n_head), self.shape, self.data_type)
+
+
+GGMLCompatibleTensor = Union[UnquantizedTensor, GGMLQuantizedTensor]
+
+
+class DeferredPermutedTensor(Tensor):
+    def __init__(self, base: Tensor, n_head: int) -> None:
+        self.base = base
+        self.n_head = n_head
+        self.data_type = self.base.data_type
+
+    def astype(self, data_type: DataType) -> Tensor:
+        return self.base.astype(data_type).permute(self.n_head)
+
+    def to_ggml(self) -> GGMLCompatibleTensor:
+        return self.base.to_ggml().permute(self.n_head)
+
+    def permute(self, n_head: int) -> Tensor:
+        raise Exception("shouldn't permute twice")
+
+
+class GPTQForLLaMaQuantizedTensor(Tensor):
+    def __init__(self, model: 'LazyModel', namebase: str) -> None:
+        qweight = load_unquantized(model[f"{namebase}.qweight"], np.int32)
+        scales = load_unquantized(model[f"{namebase}.scales"], np.float32, convert=True)
+
+        bias = model.get(f"{namebase}.bias")
+        if bias is not None:
+            # Q4_1 does not support bias; good thing the bias is always all zeros.
+            assert not np.any(load_unquantized(bias))
+
+        if f"{namebase}.zeros" in model:
+            zeros = load_unquantized(model[f"{namebase}.zeros"], np.float32)
+        else:
+            qzeros = load_unquantized(model[f"{namebase}.qzeros"], np.int32)
+            assert qzeros.dtype == np.int32
+            zeros = dequantize_q4(qzeros, scales, scales, g_idx=None)
+            assert zeros.dtype == np.float32
+
+        assert zeros.shape == scales.shape
+
+        # Output is transposed compared to the input, and addends have their sign flipped.
+        # Scales and zeros similarly must be transposed but only for newer
+        # versions of GPTQ-for-LLaMa; the older versions can be identified by
+        # having shape (n_embd, 1).
+        qweight = qweight.T
+        if scales.shape[1] != 1:
+            scales = scales.T
+            zeros = zeros.T
+
+        # Output also has signs flipped for the addends.
+        self.qweight = qweight
+        self.scales = scales
+        self.addends = -zeros
+
+        self.g_idx: Optional[NDArray]
+        if f"{namebase}.g_idx" in model:
+            self.g_idx = load_unquantized(model[f"{namebase}.g_idx"], np.int32)
+            assert self.g_idx.shape == (qweight.shape[1] * 8,)
+        else:
+            self.g_idx = None
+
+        self.shape = [self.qweight.shape[0], self.qweight.shape[1] * 8]
+        self.data_type = QuantizedDataType(groupsize=self.groupsize(), have_addends=True,
+                                           have_g_idx=(self.g_idx is not None))
+
+    def inspect(self, row: int, col: int) -> None:
+        '''For debugging.'''
+        qweight = (self.qweight[row, col // 8] >> (4 * (col & 7))) & 0xf
+        if self.g_idx is not None:
+            group = self.g_idx[col]
+        else:
+            group = int(col // self.groupsize())
+        scale = self.scales[row, group]
+        addend = self.addends[row, group]
+        with np.printoptions(precision=None, suppress=True):
+            print(f'scale:{scale} addend:{addend} qweight:{qweight}')
+            print('possible values:', np.arange(16) * scale + addend)
+            print('actual value:', qweight * scale + addend)
+
+    def astype(self, data_type: DataType) -> Tensor:
+        if isinstance(data_type, QuantizedDataType):
+            assert self.g_idx is None and data_type.have_addends is True and data_type.have_g_idx is False
+            return self.regroup(data_type.groupsize)
+
+        dequantized = dequantize_q4(np.ascontiguousarray(self.qweight), self.scales, self.addends, self.g_idx)
+        return UnquantizedTensor(dequantized).astype(data_type)
+
+    def groupsize(self) -> int:
+        assert self.addends.shape == self.scales.shape
+        assert self.shape[1] % self.scales.shape[1] == 0
+        return self.shape[1] // self.scales.shape[1]
+
+    def regroup(self, new_groupsize: int = 32) -> 'GPTQForLLaMaQuantizedTensor':
+        # Old versions of GPTQ-for-LLaMa shared scales and addends between all the
+        # columns in a row.  Newer versions share them between every set of N
+        # columns in a row, where N is the `groupsize` parameter, usually 128.  The
+        # output format shares them between every set of 32 columns.  To handle
+        # this, duplicate scales and addends for every smaller group.
+        # (In the above, 'row' and 'column' are in the sense of the output.)
+        assert self.g_idx is None
+        old_groupsize = self.groupsize()
+        assert old_groupsize >= new_groupsize and old_groupsize % new_groupsize == 0, old_groupsize
+        ret = copy.copy(self)
+        ret.addends = self.addends.repeat(old_groupsize // new_groupsize, axis=1)
+        ret.scales = self.scales.repeat(old_groupsize // new_groupsize, axis=1)
+        ret.data_type = QuantizedDataType(groupsize=new_groupsize, have_addends=True, have_g_idx=False)
+        return ret
+
+    def permute(self, n_head: int) -> Tensor:
+        return DeferredPermutedTensor(self, n_head)
+
+    def to_ggml(self) -> GGMLQuantizedTensor:
+        # The output format looks like this:
+        # For each row:
+        #   For each group of 32 columns:
+        #     - scale (float32, 4 bytes)
+        #     - addend (float32, 4 bytes)
+        #     - weights (int4 * 32, 16 bytes)
+
+        if self.groupsize() != 32:
+            raise Exception("should have been regrouped before converting to ggml")
+
+        # Since the output format is mixed between integers and floats, we have
+        # to hackily view the floats as int32s just so numpy will let us
+        # concatenate them.
+        addends_view = self.addends.view(dtype=np.int32)[:, :, np.newaxis]
+        scales_view = self.scales.view(dtype=np.int32)[:, :, np.newaxis]
+
+        # Split into groups of 4 columns (i.e. 32 columns of quantized data):
+        grouped = self.qweight.reshape([self.qweight.shape[0], self.qweight.shape[1] // 4, 4])
+
+        # And concatenate in output order (scale, addend, then the packed weights):
+        grouped = np.concatenate([scales_view, addends_view, grouped], axis=2, casting='no')
+
+        return GGMLQuantizedTensor(grouped, self.shape, DT_Q4_1)
+
+
+@dataclass
+class LazyTensor:
+    _load: Callable[[], Tensor]
+    shape: List[int]
+    data_type: DataType
+    description: str
+
+    def load(self) -> Tensor:
+        ret = self._load()
+        assert ret.data_type == self.data_type, (self.data_type, ret.data_type, self.description)
+        return ret
+
+    def astype(self, data_type: DataType) -> 'LazyTensor':
+        self.validate_conversion_to(data_type)
+
+        def load() -> Tensor:
+            return self.load().astype(data_type)
+        return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}')
+
+    def validate_conversion_to(self, data_type: DataType) -> None:
+        if data_type == self.data_type:
+            return
+        if isinstance(data_type, QuantizedDataType):
+            if not isinstance(self.data_type, QuantizedDataType):
+                raise Exception(f"Can't turn an unquantized tensor into a quantized type ({data_type})")
+            if self.data_type.have_g_idx:
+                sys.stderr.write("Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), which is not yet natively supported by GGML.  For now you can still convert this model by passing `--outtype f16` to dequantize, but that will result in a much larger output file for no quality benefit.\n")
+                sys.exit(1)
+            assert not data_type.have_g_idx and self.data_type.have_addends and data_type.have_addends
+
+
+LazyModel = Dict[str, LazyTensor]
+
+
+@dataclass
+class ModelPlus:
+    model: LazyModel
+    paths: List[Path]  # Where this was read from.
+    format: Literal['ggml', 'torch', 'safetensors']
+    vocab: Optional[Vocab]  # For GGML models (which have vocab built in), the vocab.
+
+
+def merge_sharded(models: List[LazyModel]) -> LazyModel:
+    # Original LLaMA models have each file contain one part of each tensor.
+    # Use a dict instead of a set to preserve order.
+    names = {name: None for model in models for name in model}
+
+    def convert(name: str) -> LazyTensor:
+        lazy_tensors: List[LazyTensor] = [model[name] for model in models]
+        if len(lazy_tensors) == 1:
+            # only one file; don't go through this procedure since there might
+            # be quantized tensors
+            return lazy_tensors[0]
+        if len(lazy_tensors[0].shape) == 1:
+            # the tensor is just duplicated in every file
+            return lazy_tensors[0]
+        if name.startswith('tok_embeddings.') or \
+           name.endswith('.attention.wo.weight') or \
+           name.endswith('.feed_forward.w2.weight'):
+            # split by columns
+            axis = 1
+        else:
+            # split by rows
+            axis = 0
+        concatenated_shape = list(lazy_tensors[0].shape)
+        concatenated_shape[axis] = sum(tensor.shape[axis] for tensor in lazy_tensors)
+
+        def load() -> UnquantizedTensor:
+            ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
+            concatenated: NDArray = np.concatenate(ndarrays, axis=axis)
+            return UnquantizedTensor(concatenated)
+        description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
+        return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
+    return {name: convert(name) for name in names}
+
+
+def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
+    formats = set(mp.format for mp in models_plus)
+    assert len(formats) == 1, "different formats?"
+    format = formats.pop()
+    paths = [path for mp in models_plus for path in mp.paths]
+    # Use the first non-None vocab, if any.
+    try:
+        vocab = next(mp.vocab for mp in models_plus if mp.vocab is not None)
+    except StopIteration:
+        vocab = None
+
+    if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
+        # Transformers models put different tensors in different files, but
+        # don't split individual tensors between files.
+        model: LazyModel = {}
+        for mp in models_plus:
+            model.update(mp.model)
+    else:
+        model = merge_sharded([mp.model for mp in models_plus])
+
+    return ModelPlus(model, paths, format, vocab)
+
+
+def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor:
+    def load() -> Tensor:
+        return lazy_tensor.load().permute(n_head)
+    return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
+
+
+def convert_transformers_to_orig(model: LazyModel) -> LazyModel:
+    out: LazyModel = {}
+    out["tok_embeddings.weight"] = model["model.embed_tokens.weight"]
+    out["norm.weight"] = model["model.norm.weight"]
+    out["output.weight"] = model["lm_head.weight"]
+
+    n_head = model["model.layers.0.self_attn.q_proj.weight"].shape[1] // 128
+    for i in itertools.count():
+        if f"model.layers.{i}.self_attn.q_proj.weight" not in model:
+            break
+        out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], n_head)
+        out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], n_head)
+        out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
+        out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]
+
+        out[f"layers.{i}.feed_forward.w1.weight"] = model[f"model.layers.{i}.mlp.gate_proj.weight"]
+        out[f"layers.{i}.feed_forward.w2.weight"] = model[f"model.layers.{i}.mlp.down_proj.weight"]
+        out[f"layers.{i}.feed_forward.w3.weight"] = model[f"model.layers.{i}.mlp.up_proj.weight"]
+
+        out[f"layers.{i}.attention_norm.weight"] = model[f"model.layers.{i}.input_layernorm.weight"]
+        out[f"layers.{i}.ffn_norm.weight"] = model[f"model.layers.{i}.post_attention_layernorm.weight"]
+    return out
+
+
+def handle_quantization(model: LazyModel) -> LazyModel:
+    '''Convert a model with entries for 'foo.qweight', 'foo.scales', etc.
+    (which resolve to UnquantizedTensors with the raw data) to one with entries
+    for 'foo.weight' (which resolve to QuantizedTensors).
+    '''
+    def convert(name: str) -> Tuple[str, LazyTensor]:
+        if name.endswith(".qweight"):
+            namebase = name.rsplit('.', 1)[0]
+            orig_name = namebase + ".weight"
+
+            lazy_tensor = model[name]
+            assert len(lazy_tensor.shape) == 2
+            real_shape = [lazy_tensor.shape[1], lazy_tensor.shape[0] * 8]
+
+            # Calculate type.  This replicates the logic in
+            # GPTQForLLaMaQuantizedTensor (which is executed when the model is
+            # actually loaded).
+            lazy_scales = model[f"{namebase}.scales"]
+            scales_width = 1 if lazy_scales.shape[1] == 1 else lazy_scales.shape[0]
+            assert real_shape[1] % scales_width == 0
+            groupsize = real_shape[1] // scales_width
+            have_g_idx = f"{namebase}.g_idx" in model
+            data_type = QuantizedDataType(groupsize=groupsize, have_addends=True, have_g_idx=have_g_idx)
+
+            def load() -> Tensor:
+                return GPTQForLLaMaQuantizedTensor(model, namebase)
+
+            return (orig_name, LazyTensor(load, real_shape, data_type, '[quantized]'))
+        else:
+            return (name, model[name])
+    return dict(convert(name) for name in model)
+
+# Functionality that simulates `torch.load` but where individual tensors are
+# only loaded into memory on demand, not all at once.
+# PyTorch can't do this natively as of time of writing:
+# - https://github.com/pytorch/pytorch/issues/64327
+# This allows us to de-shard without multiplying RAM usage, and also
+# conveniently drops the PyTorch dependency (though we still need numpy).
+
+
+@dataclass
+class LazyStorageKind:
+    data_type: DataType
+
+
+@dataclass
+class LazyStorage:
+    load: Callable[[int, int], NDArray]
+    kind: LazyStorageKind
+    description: str
+
+
+class LazyUnpickler(pickle.Unpickler):
+    def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile):
+        super().__init__(fp)
+        self.data_base_path = data_base_path
+        self.zip_file = zip_file
+
+    def persistent_load(self, pid: Any) -> Any:
+        assert pid[0] == 'storage'
+        assert isinstance(pid[1], LazyStorageKind)
+        data_type = pid[1].data_type
+        filename_stem = pid[2]
+        filename = self.data_base_path + '/' + filename_stem
+        info = self.zip_file.getinfo(filename)
+
+        def load(offset: int, elm_count: int) -> NDArray:
+            dtype = DATA_TYPE_TO_NUMPY.get(data_type)
+            if dtype is None:
+                raise Exception("tensor stored in unsupported format")
+            fp = self.zip_file.open(info)
+            fp.seek(offset * dtype.itemsize)
+            size = elm_count * dtype.itemsize
+            data = fp.read(size)
+            assert len(data) == size
+            return np.frombuffer(data, dtype)
+        description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
+        return LazyStorage(load=load, kind=pid[1], description=description)
+
+    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any, # pyright: ignore[reportSelfClsParameterName]
+                               requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
+        assert isinstance(storage, LazyStorage)
+
+        def load() -> UnquantizedTensor:
+            elm_count = stride[0] * size[0]
+            return UnquantizedTensor(storage.load(storage_offset, elm_count).reshape(size))
+        description = f'pickled storage_offset={storage_offset} in {storage.description}'
+        return LazyTensor(load, list(size), storage.kind.data_type, description)
+
+    CLASSES: Dict[Any, Any] = {
+        ('torch._utils', '_rebuild_tensor_v2'): lazy_rebuild_tensor_v2,
+        ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
+        ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
+        ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
+        ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
+    }
+
+    def find_class(self, module: str, name: str) -> Any:
+        if not module.startswith('torch'):
+            return super().find_class(module, name)
+        return self.CLASSES[(module, name)]
+
+
+def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
+    zf = zipfile.ZipFile(outer_fp)
+    pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')]
+    assert len(pickle_paths) == 1, pickle_paths
+    pickle_fp = zf.open(pickle_paths[0], 'r')
+    unpickler = LazyUnpickler(pickle_fp,
+                              data_base_path=pickle_paths[0][:-4],
+                              zip_file=zf)
+    model = unpickler.load()
+    as_dict = dict(model.items())
+    return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)
+
+
+SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
+    'F16': DT_F16,
+    'F32': DT_F32,
+    'I32': DT_I32,
+}
+
+
+def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
+    '''Parse the safetensors JSON header from `fp` and return lazy tensors backed by an mmap; the non-tensor '__metadata__' entry is skipped.'''
+    header_size, = struct.unpack('<Q', fp.read(8))
+    header: Dict[str, Dict[str, Any]] = json.loads(fp.read(header_size))
+    # Use mmap for the actual data to avoid race conditions with the file offset.
+    mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
+    byte_buf = mapped[fp.tell():]
+
+    def convert(info: Dict[str, Any]) -> LazyTensor:
+        data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
+        numpy_dtype = DATA_TYPE_TO_NUMPY[data_type]
+        shape: List[int] = info['shape']
+        begin, end = info['data_offsets']
+        assert 0 <= begin <= end <= len(byte_buf)
+        assert end - begin == math.prod(shape) * numpy_dtype.itemsize
+        buf = byte_buf[begin:end]
+
+        def load() -> UnquantizedTensor:
+            return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape))
+        return LazyTensor(load, shape, data_type, f'safetensors begin={begin} end={end} type={data_type} path={path}')
+    model = {name: convert(info) for (name, info) in header.items() if name != '__metadata__'}
+    return ModelPlus(model=model, paths=[path], format='safetensors', vocab=None)
+
+
+def must_read(fp: IO[bytes], length: int) -> bytes:  # read `length` bytes or raise on a short read
+    chunk = fp.read(length)
+    if len(chunk) >= length:
+        return chunk
+    raise Exception("unexpectedly reached end of file")
+
+
+def lazy_load_ggml_file(fp: IO[bytes], path: Path) -> ModelPlus:
+    magic = must_read(fp, 4)[::-1]
+    if magic in (b'ggmf', b'ggjt'):
+        version, = struct.unpack("i", must_read(fp, 4))
+        assert version == 1
+    else:
+        assert magic == b'ggml'
+        version = None
+    n_vocab, n_embd, n_mult, n_head, n_layer, rot, file_type = struct.unpack('<7i', must_read(fp, 28))
+
+    tokens: List[Tuple[bytes, float]] = []
+    for i in range(n_vocab):
+        if i == 32000:
+            # HACK: GPT4All messed with the format without changing the magic
+            # number.  Specifically, they changed the vocab section to contain
+            # `n_vocab - 1` tokens instead of `n_vocab` (i.e. omitting the
+            # extra pad token).  Try to detect if we're reading a file like
+            # this.
+            orig_pos = fp.tell()
+            fp.seek(20, io.SEEK_CUR)
+            is_gpt4all = fp.read(21) == b'tok_embeddings.weight'
+            fp.seek(orig_pos)
+            if is_gpt4all:
+                break
+
+        length, = struct.unpack("i", must_read(fp, 4))
+        text = must_read(fp, length)
+        if magic != b'ggml':
+            score, = struct.unpack("f", must_read(fp, 4))
+            tokens.append((text, score))
+    vocab = GGMLVocab(tokens) if magic != b'ggml' else None
+
+    model: LazyModel = {}
+    # Use mmap for the actual data to avoid race conditions with the file offset.
+    mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
+
+    def read_tensor() -> None:  # this is a function so that variables captured in `load` don't change
+        shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12))
+        assert 0 <= shape_len <= 3
+        shape: List[int] = list(struct.unpack(f"{shape_len}i", must_read(fp, 4 * shape_len)))
+        shape = shape[::-1]
+        name = must_read(fp, name_len).decode('utf-8')
+        data_type = FTYPE_TO_DATA_TYPE[ftype]
+
+        if magic == b'ggjt':
+            fp.seek((fp.tell() + 31) & -32)
+
+        if data_type == DT_Q4_1:
+            # See GPTQForLLaMaQuantizedTensor.ggml_ndarray()
+            size = 24 * (shape[1] // 32) * shape[0]
+        elif data_type == DT_Q4_0:
+            size = 20 * (shape[1] // 32) * shape[0]
+        else:
+            numpy_dtype = DATA_TYPE_TO_NUMPY[data_type]
+            elm_count = math.prod(shape)
+            size = elm_count * numpy_dtype.itemsize
+        offset = fp.tell()
+        buf = mapped[offset:offset+size]
+        fp.seek(size, io.SEEK_CUR)
+
+        def load() -> Tensor:
+            if isinstance(data_type, QuantizedDataType):
+                ndarray = np.frombuffer(buf, dtype=np.uint32)
+                return GGMLQuantizedTensor(ndarray, shape, data_type)
+            else:
+                return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape))
+        description = f'ggml offset={offset} type={data_type} path={path}'
+        model[name] = LazyTensor(load, shape, data_type, description)
+
+    while fp.read(1) != b'':
+        fp.seek(-1, io.SEEK_CUR)
+        read_tensor()
+
+    return ModelPlus(model=model, paths=[path], format='ggml', vocab=vocab)
+
+
+@functools.lru_cache(maxsize=None)
+def lazy_load_file(path: Path) -> ModelPlus:
+    '''Open `path`, sniff its format from the first 8 bytes and hand the open file to the matching lazy loader; raises ValueError if unrecognised.'''
+    fp = open(path, 'rb')
+    first8 = fp.read(8)
+    fp.seek(0)
+    if first8[:2] == b'PK':
+        # A zip file, i.e. PyTorch format
+        return lazy_load_torch_file(fp, path)
+    elif first8[2:4] == b'gg':
+        # GGML format
+        return lazy_load_ggml_file(fp, path)
+    elif len(first8) == 8 and struct.unpack('<Q', first8)[0] < 16 * 1024 * 1024:
+        return lazy_load_safetensors_file(fp, path)  # Probably safetensors: plausible (< 16 MiB) little-endian header size
+    fp.close()  # no loader took ownership of the handle, so don't leak it
+    raise ValueError(f"unknown format: {path}")
+
+
+In = TypeVar('In')
+Out = TypeVar('Out')
+
+
+def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int) -> Iterable[Out]:
+    '''Parallel map, but with backpressure.  If the caller doesn't call `next`
+    fast enough, this will stop calling `func` at some point rather than
+    letting results pile up in memory.  Specifically, there is a max of one
+    output value buffered per thread.'''
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        futures: List[concurrent.futures.Future[Out]] = []
+        items_rev = list(iterable)[::-1]
+        for i in range(min(concurrency, len(items_rev))):
+            futures.append(executor.submit(func, items_rev.pop()))
+        while futures:
+            result = futures.pop(0).result()
+            if items_rev:
+                futures.append(executor.submit(func, items_rev.pop()))
+            yield result
+
+
+def check_vocab_size(params: Params, vocab: Vocab) -> None:
+    if params.n_vocab != vocab.vocab_size:
+        # GGMLVocab comes from the same file as the model so shouldn't mismatch:
+        assert isinstance(vocab, SentencePieceVocab)
+        if params.n_vocab == vocab.vocab_size_base:
+            print("Ignoring added_tokens.json since model matches vocab size without it.")
+            vocab.added_tokens_list = []
+            vocab.vocab_size = vocab.vocab_size_base
+            return
+        msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
+        if vocab.fname_added_tokens is not None:
+            msg += f" combined with {vocab.fname_added_tokens}"
+        msg += f" has {vocab.vocab_size})."
+        if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None:
+            msg += f"  Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
+        raise Exception(msg)
+
+
+class OutputFile:
+    '''Writer for the single-file "ggjt" output: file header, vocab, then one aligned header + data record per tensor.'''
+    def __init__(self, fname_out: Path) -> None:
+        self.fout = open(fname_out, "wb")
+
+    def write_file_header(self, params: Params) -> None:
+        self.fout.write(b"ggjt"[::-1])  # magic
+        values = [
+            1,  # file version
+            params.n_vocab,
+            params.n_embd,
+            params.n_mult,
+            params.n_head,
+            params.n_layer,
+            params.n_embd // params.n_head,  # rot (obsolete)
+            params.file_type.value,
+        ]
+        self.fout.write(struct.pack("i" * len(values), *values))
+
+    def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None:
+        sname = name.encode('utf-8')
+        self.fout.write(struct.pack("iii", len(shape), len(sname), DATA_TYPE_TO_FTYPE[data_type]))
+        self.fout.write(struct.pack("i" * len(shape), *shape[::-1]))  # dimensions are stored in reverse order
+        self.fout.write(sname)
+        self.fout.seek((self.fout.tell() + 31) & -32)  # advance to the next multiple of 32
+
+    def write_vocab(self, vocab: Vocab) -> None:
+        for text, score in vocab.all_tokens():
+            self.fout.write(struct.pack("i", len(text)))
+            self.fout.write(text)
+            self.fout.write(struct.pack("f", score))
+
+    @staticmethod
+    def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
+        # Open the output exactly once (below); a second open() would leak the first handle.
+        params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0,
+                        n_head=1, n_layer=0, file_type=GGMLFileType.AllF32)  # n_head=1 keeps n_embd // n_head valid
+        of = OutputFile(fname_out)
+        of.write_file_header(params)
+        of.write_vocab(vocab)
+        of.fout.close()
+
+    @staticmethod
+    def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None:
+        check_vocab_size(params, vocab)
+        of = OutputFile(fname_out)
+        of.write_file_header(params)
+        print("Writing vocab...")
+        of.write_vocab(vocab)
+
+        def do_item(item: Tuple[str, LazyTensor]) -> NDArray:
+            return item[1].load().to_ggml().ndarray
+
+        ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8)
+        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
+            size = ' x '.join(map(str, lazy_tensor.shape))
+            print(f"[{i+1}/{len(model)}] Writing tensor {name}, size {size}...")
+            of.write_tensor_header(name, lazy_tensor.shape, lazy_tensor.data_type)
+            ndarray.tofile(of.fout)
+        of.fout.close()
+
+
+def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType:
+    wq_type = model["layers.0.attention.wq.weight"].data_type
+    if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
+        return GGMLFileType.AllF32
+    if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16):
+        return GGMLFileType.MostlyF16
+    if output_type_str == "q4_1" or (output_type_str is None and isinstance(wq_type, QuantizedDataType) and
+                                     wq_type.have_addends):
+        if isinstance(model["output.weight"].data_type, QuantizedDataType):
+            return GGMLFileType.MostlyQ4_1
+        else:
+            return GGMLFileType.PerLayerIsQ4_1
+    if output_type_str == "q4_0" or (output_type_str is None and isinstance(wq_type, QuantizedDataType)):
+        return GGMLFileType.MostlyQ4_0
+    name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
+    raise Exception(f"Unexpected combination of types: {name_to_type}")
+
+
+def do_necessary_conversions(model: LazyModel) -> LazyModel:
+    model = handle_quantization(model)
+
+    if "lm_head.weight" in model:
+        model = convert_transformers_to_orig(model)
+    model = filter_and_sort_tensors(model)
+
+    return model
+
+
+def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
+    return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
+            for (name, tensor) in model.items()}
+
+
+def nth_multifile_path(path: Path, n: int) -> Optional[Path]:
+    '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
+    the nth path in the model, or None if no such file exists.
+    '''
+    # Support the following patterns:
+    patterns: List[Tuple[str, str]] = [
+        # - x.00.pth, x.01.pth, etc.
+        (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
+        # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
+        (r'-[0-9]{5}-of-(.*)$', fr'-{n:05}-of-\1'),
+        # x.bin, x.bin.1, etc.  (part 0 has no numeric suffix; any existing suffix is replaced, not kept)
+        (r'(\.[0-9]+)?$', '' if n == 0 else f'.{n}')
+    ]
+    for regex, replacement in patterns:
+        if re.search(regex, path.name):
+            new_path = path.with_name(re.sub(regex, replacement, path.name, count=1))  # count=1: skip the extra empty match at '$'
+            if new_path.exists():
+                return new_path
+    return None
+
+
+def find_multifile_paths(path: Path) -> List[Path]:
+    '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
+    the whole list of paths in the model.
+    '''
+    ret: List[Path] = []
+    for i in itertools.count():
+        nth_path = nth_multifile_path(path, i)
+        if nth_path is None:
+            break
+        ret.append(nth_path)
+    if not ret:
+        # No matches.  This should only happen if the file was named, e.g.,
+        # foo.0, and there was no file named foo.  Oh well, try to process it
+        # as a single file.
+        return [path]
+    return ret
+
+
+def load_some_model(path: Path) -> ModelPlus:
+    '''Load a model of any supported format.'''
+    # Be extra-friendly and accept either a file or a directory:
+    if path.is_dir():
+        globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt"]
+        files = [file for glob in globs for file in path.glob(glob)]
+        if not files:
+            # Try GGML too, but with lower priority, since if both a non-GGML
+            # model and a GGML model exist in the same directory, we assume the
+            # latter was converted from the former.
+            files = list(path.glob("ggml-model*.bin*"))
+        if not files:
+            raise Exception(f"Can't find model in directory {path}")
+        if len(files) > 1:
+            raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}")
+        path = files[0]
+
+    paths = find_multifile_paths(path)
+    models_plus: List[ModelPlus] = []
+    for path in paths:
+        print(f"Loading model file {path}")
+        models_plus.append(lazy_load_file(path))
+
+    model_plus = merge_multifile_models(models_plus)
+    return model_plus
+
+
+def filter_and_sort_tensors(model: LazyModel) -> LazyModel:
+    return {name: model[name] for name in TENSORS_LIST if name in model}
+
+
+def load_vocab(path: Path) -> SentencePieceVocab:
+    # Be extra-friendly and accept either a file or a directory.  Also, if it's
+    # a directory, it might be the model directory, and tokenizer.model might
+    # be in the parent of that.
+    if path.is_dir():
+        path2 = path / "tokenizer.model"
+        # Use `.parent` instead of /.. to handle the symlink case better.
+        path3 = path.parent / "tokenizer.model"
+        if path2.exists():
+            path = path2
+        elif path3.exists():
+            path = path3
+        else:
+            raise FileNotFoundError(f"Could not find tokenizer.model in {path} or its parent; if it's in another directory, pass the directory as --vocab-dir")
+    added_tokens_path = path.parent / "added_tokens.json"
+    print(f"Loading vocab file {path}")
+    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+
+
+def default_outfile(model_paths: List[Path], params: Params) -> Path:
+    namestr = {  # file-name tag for every file type pick_output_type() can return
+        GGMLFileType.AllF32: "f32",
+        GGMLFileType.MostlyF16: "f16",
+        GGMLFileType.MostlyQ4_0: "q4_0",
+        GGMLFileType.MostlyQ4_1: "q4_1",
+        GGMLFileType.PerLayerIsQ4_1: "q4_1",
+    }[params.file_type]
+    ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
+    if ret in model_paths:  # refuse to overwrite an input file; sys.exit(str) prints to stderr and exits with status 1
+        sys.exit(f"Error: Default output path ({ret}) would overwrite the input.  Please explicitly specify a path using --outfile.")
+    return ret
+
+
+def do_dump_model(model_plus: ModelPlus) -> None:
+    print(f"model_plus.paths = {model_plus.paths!r}")
+    print(f"model_plus.format = {model_plus.format!r}")
+    print(f"model_plus.vocab = {model_plus.vocab!r}")
+    for name, lazy_tensor in model_plus.model.items():
+        print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
+
+
+def main(args_in: Optional[List[str]] = None) -> None:
+    parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
+    parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
+    parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
+    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
+    parser.add_argument("--outtype", choices=["f32", "f16", "q4_1"], help="output format (default: based on input)")
+    parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
+    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
+    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
+    args = parser.parse_args(args_in)
+
+    vocab: Vocab
+    if args.dump_single:
+        model_plus = lazy_load_file(args.model)
+        do_dump_model(model_plus)
+    elif args.vocab_only:
+        vocab = load_vocab(args.vocab_dir or args.model)
+        assert args.outfile, "need --outfile if using --vocab-only"
+        outfile = args.outfile
+        OutputFile.write_vocab_only(outfile, vocab)
+        print(f"Wrote {outfile}")
+    else:
+        model_plus = load_some_model(args.model)
+        if args.dump:
+            do_dump_model(model_plus)
+            return
+        if model_plus.vocab is not None and args.vocab_dir is None:
+            vocab = model_plus.vocab
+        else:
+            vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
+            vocab = load_vocab(vocab_dir)
+        model = model_plus.model
+        model = do_necessary_conversions(model)
+        output_type = pick_output_type(model, args.outtype)
+        model = convert_to_output_type(model, output_type)
+        params = Params.guessed(model, output_type)
+        outfile = args.outfile or default_outfile(model_plus.paths, params)
+        OutputFile.write_all(outfile, params, model, vocab)
+        print(f"Wrote {outfile}")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/migrate-ggml-2023-03-30-pr613.py b/migrate-ggml-2023-03-30-pr613.py
deleted file mode 100644
index b6ef2476e..000000000
--- a/migrate-ggml-2023-03-30-pr613.py
+++ /dev/null
@@ -1,311 +0,0 @@
-# Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic
-#
-# We caused a breaking change to the file format on 2023-03-30 in:
-#     https://github.com/ggerganov/llama.cpp/pull/613
-#
-# (1) If you still have the Meta LLaMA .pth files, then close this
-#     file now; you can just run `convert-pth-to-ggml.py` again to
-#     migrate to the new format. The tool is easier to use too. It
-#     isn't necessary anymore to manage split output files because
-#     the new format always combines things into a single file.
-#
-# (2) If you deleted the Meta LLaMA .pth files due to save on disk
-#     space, then this tool is intended to help you.  Please check
-#     out the instructions below.
-#
-# USAGE
-#
-#     python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT
-#
-# PREREQUISITES
-#
-#     pip install numpy
-#     cd llama.cpp
-#     make -j4
-#
-# EXAMPLE (7B MODEL)
-#
-#     # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
-#     python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin
-#
-#     # check that it works
-#     ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
-#
-#     # you can delete the old files
-#     rm -f models/7B/ggml-model-f16.bin
-#     mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin
-#
-# EXAMPLE (13B MODEL)
-#
-#     # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
-#     python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin
-#
-#     # check that it works
-#     ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
-#
-#     # you can delete the old files
-#     rm -f models/13B/ggml-model-f16.bin*
-#     mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin
-#
-
-import argparse
-import os
-import sys
-import json
-import struct
-import numpy as np
-
-QK = 32
-
-GGML_TYPE_Q4_0  = 0
-GGML_TYPE_Q4_1  = 1
-GGML_TYPE_I8    = 2
-GGML_TYPE_I16   = 3
-GGML_TYPE_I32   = 4
-GGML_TYPE_F16   = 5
-GGML_TYPE_F32   = 6
-
-WTYPE_NAMES = {
-    0: "F32",
-    1: "F16",
-    2: "Q4_0",
-    3: "Q4_1",
-}
-
-WTYPES = {
-    0: GGML_TYPE_F32,
-    1: GGML_TYPE_F16,
-    2: GGML_TYPE_Q4_0,
-    3: GGML_TYPE_Q4_1,
-}
-
-GGML_BLCK_SIZE = {
-    GGML_TYPE_Q4_0:  QK,
-    GGML_TYPE_Q4_1:  QK,
-    GGML_TYPE_I8:    1,
-    GGML_TYPE_I16:   1,
-    GGML_TYPE_I32:   1,
-    GGML_TYPE_F16:   1,
-    GGML_TYPE_F32:   1,
-}
-
-GGML_TYPE_SIZE = {
-    GGML_TYPE_Q4_0: 4   + QK//2,
-    GGML_TYPE_Q4_1: 4*2 + QK//2,
-    GGML_TYPE_I8:   1,
-    GGML_TYPE_I16:  2,
-    GGML_TYPE_I32:  4,
-    GGML_TYPE_F16:  2,
-    GGML_TYPE_F32:  4,
-}
-
-HPARAMS = [
-    'magic',    # int32
-    'version',  # int32
-    'n_vocab',  # int32
-    'n_embd',   # int32
-    'n_mult',   # int32
-    'n_head',   # int32
-    'n_layer',  # int32
-    'n_rot',    # int32
-    'f16',      # int32
-]
-
-def read_hparams(fin):
-    struct_fmt = "i" * len(HPARAMS)
-    struct_size = struct.calcsize(struct_fmt)
-    buf = fin.read(struct_size)
-    ints = struct.unpack(struct_fmt, buf)
-    hparams = dict(zip(HPARAMS, ints))
-    return hparams
-
-def write_hparams(fout, hparams):
-    struct_fmt = "i" * len(HPARAMS)
-    struct_size = struct.calcsize(struct_fmt)
-    ints = [hparams[h] for h in HPARAMS]
-    fout.write(struct.pack(struct_fmt, *ints))
-
-def read_tokens(fin, hparams):
-    tokens = []
-    for i in range(hparams['n_vocab']):
-        len_b = fin.read(4)
-        (length,) = struct.unpack("i", len_b)
-        word = fin.read(length)
-        score_b = fin.read(4)
-        (score,) = struct.unpack("f", score_b)
-        tokens.append((word, score))
-    return tokens
-
-def write_tokens(fout, tokens):
-    for word, score in tokens:
-        fout.write(struct.pack("i", len(word)))
-        fout.write(word)
-        fout.write(struct.pack("f", score))
-
-def ggml_nelements(shape):
-    r = 1
-    for i in shape:
-        r *= i
-    return r
-
-def ggml_nbytes(shape, ftype):
-    x = ggml_nelements(shape)
-    t = WTYPES[ftype]
-    x *= GGML_TYPE_SIZE[t]
-    x //= GGML_BLCK_SIZE[t]
-    return x
-
-def copy_tensors(fin, fout, part_id, n_parts):
-    while True:
-
-        b = fin.read(4)
-        if not b: break
-        (n_dims,) = struct.unpack("i", b)
-        b = fin.read(4)
-        (length,) = struct.unpack("i", b)
-        b = fin.read(4)
-        (ftype,) = struct.unpack("i", b)
-
-        assert n_dims in (1, 2)
-
-        partshape = list(range(n_dims))
-        for i in range(n_dims):
-            b = fin.read(4)
-            partshape[i] = struct.unpack("i", b)[0]
-        partshape = list(reversed(partshape))
-
-        name = fin.read(length)
-        data = fin.read(ggml_nbytes(partshape, ftype))
-
-        blck_size = GGML_BLCK_SIZE[WTYPES[ftype]]
-        type_size = GGML_TYPE_SIZE[WTYPES[ftype]]
-
-        print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}")
-
-        # determine dimension along which multipart tensor is sharded
-        #
-        # split_dim 0 regex:
-        #   - output.*
-        #   - layers.*.attention.wq.weight
-        #   - layers.*.attention.wk.weight
-        #   - layers.*.attention.wv.weight
-        #   - layers.*.feed_forward.w1.weight
-        #   - layers.*.feed_forward.w3.weight
-        #
-        # split_dim 1 regex:
-        #   - tok_embeddings.*
-        #   - layers.*.attention.wo.weight
-        #   - layers.*.feed_forward.w2.weight
-        #
-        if n_dims > 1:
-            split_dim = 1
-            if b"tok_embeddings" in name:
-                split_dim = 1
-            elif b"layers" in name:
-                if b"attention.wo.weight" in name:
-                    split_dim = 1
-                elif b"feed_forward.w2.weight" in name:
-                    split_dim = 1
-                else:
-                    split_dim = 0
-            elif b"output" in name:
-                split_dim = 0
-
-        # output tensor header
-        fullshape = list(partshape)
-        if n_dims > 1:
-            fullshape[split_dim] *= n_parts
-        fout.write(struct.pack("iii", n_dims, len(name), ftype))
-        for dim in reversed(fullshape):
-            fout.write(struct.pack("i", dim))
-        fout.write(name)
-
-        # ensure tensor data is aligned
-        tensor_data_offset = fout.tell()
-        while tensor_data_offset % QK != 0:
-            fout.write(struct.pack("B", 0))
-            tensor_data_offset += 1
-
-        # output unified mappable tensor data
-        if n_dims == 1 or n_parts == 1:
-            # copy tensor which we thankfully received in one piece
-            if part_id == 0:
-                fout.write(data)
-        elif split_dim == 0:
-            # reassemble multifile tensor containing some of the rows
-            rows_per_chunk = partshape[0]
-            current_row = part_id * rows_per_chunk
-            bytes_per_row = fullshape[1] // blck_size * type_size
-            offset = current_row * bytes_per_row
-            fout.seek(tensor_data_offset + offset)
-            fout.write(data)
-        elif split_dim == 1:
-            # reassemble multifile tensor containing some of the cols
-            cols_per_chunk = partshape[1]
-            current_col = part_id * cols_per_chunk
-            bpr = partshape[1] // blck_size * type_size
-            bytes_per_row = fullshape[1] // blck_size * type_size
-            offset_current_col = current_col // blck_size * type_size
-            for row in range(partshape[0]):
-                offset_row = row * bytes_per_row
-                offset = offset_row + offset_current_col
-                fout.seek(tensor_data_offset + offset)
-                fout.write(data[row * bpr:row * bpr + bpr])
-
-        # advance file position to next tensor
-        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype))
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Migrate from GGML to new GGJT file format')
-    parser.add_argument('fin_path', help='your old ggml file (leave out the .1 .2 etc.)')
-    parser.add_argument('fout_path', help='your new ggjt file name')
-    return parser.parse_args()
-
-def main():
-    args = parse_args()
-    assert args.fin_path
-    assert args.fout_path
-    assert args.fin_path != args.fout_path
-
-    with open(args.fin_path, "rb") as fin:
-        hparams = read_hparams(fin)
-        tokens = read_tokens(fin, hparams)
-
-    if hparams['magic'] == 0x67676a74:  # ggjt
-        print(f"{args.fin_path}: input ggml has already been converted to 'ggjt' magic\n")
-        sys.exit(1)
-
-    if hparams['magic'] != 0x67676d66:  # ggmf
-        print(f"{args.fin_path}: input ggml file doesn't have expected 'ggmf' magic: {hparams['magic']:#x}\n")
-        sys.exit(1)
-
-    hparams['magic'] = 0x67676a74  # ggjt
-
-    # count number of multipart files by convention
-    n_parts = 1
-    while True:
-        if os.path.exists(f"{args.fin_path}.{n_parts}"):
-            n_parts += 1
-        else:
-            break
-
-    # we output a single file for ggml
-    with open(args.fout_path, "wb") as fout:
-        write_hparams(fout, hparams)
-        write_tokens(fout, tokens)
-        offset_of_tensors = fout.tell()
-        # the tensors we load could be split across multiple files
-        for part_id in range(n_parts):
-            fout.seek(offset_of_tensors)
-            print(f"Processing part {part_id+1} of {n_parts}\n")
-            fin_path = args.fin_path
-            if part_id > 0:
-                fin_path += f".{part_id}"
-            with open(fin_path, "rb") as fin:
-                read_tokens(fin, read_hparams(fin))
-                copy_tensors(fin, fout, part_id, n_parts)
-
-    print(f"Done. Output file: {args.fout_path}\n")
-
-if __name__ == "__main__":
-    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 000000000..f3944951a
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+numpy==1.24
+sentencepiece==0.1.97

From c14e0d2f23e6d1e785255f4da8c253c1b4723659 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 14 Apr 2023 13:31:15 +0300
Subject: [PATCH 22/34] ggml : always allocate buffers with size multiple of
 GGML_MEM_ALIGN

---
 ggml.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/ggml.c b/ggml.c
index d620cd11f..76694a617 100644
--- a/ggml.c
+++ b/ggml.c
@@ -3054,9 +3054,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         return NULL;
     }
 
+    // pad only sizes we allocate ourselves; a caller-supplied buffer is only known to hold params.mem_size bytes
+    const size_t mem_size = params.mem_buffer ? params.mem_size : ((params.mem_size + GGML_MEM_ALIGN - 1) & ~(GGML_MEM_ALIGN - 1));
     *ctx = (struct ggml_context) {
-        /*.mem_size           =*/ params.mem_size,
-        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(params.mem_size),
+        /*.mem_size           =*/ mem_size,
+        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
         /*.mem_buffer_owned   =*/ params.mem_buffer ? false : true,
         /*.no_alloc           =*/ params.no_alloc,
         /*.n_objects          =*/ 0,
@@ -3066,7 +3068,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         /*.scratch_save       =*/ { 0, 0, NULL, },
     };
 
-    GGML_ASSERT(ctx->mem_buffer != NULL); // check for allocation failure
+    GGML_ASSERT(ctx->mem_buffer != NULL);
 
     ggml_assert_aligned(ctx->mem_buffer);
 

From 1623a6e9b46453bff30afd7d0f6c3fd188499c2f Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 14 Apr 2023 13:31:29 +0300
Subject: [PATCH 23/34] ggml : minor

---
 ggml.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/ggml.c b/ggml.c
index 76694a617..d99aca21a 100644
--- a/ggml.c
+++ b/ggml.c
@@ -7509,7 +7509,7 @@ static void ggml_compute_forward_rope_f32(
     // row index used to determine which thread to use
     int ir = 0;
 
-    const float theta_scale = powf(10000.0, ((float)-2)/n_dims);
+    const float theta_scale = powf(10000.0, -2.0f/n_dims);
 
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
@@ -7517,12 +7517,15 @@ static void ggml_compute_forward_rope_f32(
             for (int64_t i1 = 0; i1 < ne1; i1++) {
                 if (ir++ < ir0) continue;
                 if (ir   > ir1) break;
+
                 float theta = (float)p;
+
                 for (int i0 = 0; i0 < n_dims; i0 += 2) {
                     const float cos_theta = cosf(theta);
                     const float sin_theta = sinf(theta);
 
                     theta *= theta_scale;
+
                     const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
                           float * dst_data  = (float *)((char *)  dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 
@@ -7583,7 +7586,7 @@ static void ggml_compute_forward_rope_f16(
     // row index used to determine which thread to use
     int ir = 0;
 
-    const float theta_scale = powf(10000.0, ((float)-2)/n_dims);
+    const float theta_scale = powf(10000.0, -2.0f/n_dims);
 
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {
@@ -7591,12 +7594,15 @@ static void ggml_compute_forward_rope_f16(
             for (int64_t i1 = 0; i1 < ne1; i1++) {
                 if (ir++ < ir0) continue;
                 if (ir   > ir1) break;
+
                 float theta = (float)p;
+
                 for (int i0 = 0; i0 < n_dims; i0 += 2) {
                     const float cos_theta = cosf(theta);
                     const float sin_theta = sinf(theta);
 
                     theta *= theta_scale;
+
                     const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
                           ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
 

From 43ffdefb7424f79a3d510c199e2ea86684b4f824 Mon Sep 17 00:00:00 2001
From: Pavol Rusnak <pavol@rusnak.io>
Date: Fri, 14 Apr 2023 14:23:21 +0200
Subject: [PATCH 24/34] py : fix flake8 and isort nitpicks (#960)

---
 convert.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/convert.py b/convert.py
index f35163f67..056dc618d 100644
--- a/convert.py
+++ b/convert.py
@@ -18,10 +18,12 @@ import zipfile
 from abc import ABCMeta, abstractmethod
 from dataclasses import dataclass
 from pathlib import Path
+from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List,
+                    Literal, Optional, Sequence, Tuple, TypeVar, Union)
+
 import numpy as np
 from sentencepiece import SentencePieceProcessor  # type: ignore
-from typing import (IO, Any, Callable, Iterable, Literal, Optional, Sequence,
-                    TypeVar, Union, List, Dict, Tuple, TYPE_CHECKING)
+
 if TYPE_CHECKING:
     from typing_extensions import TypeAlias
 
@@ -684,7 +686,7 @@ class LazyUnpickler(pickle.Unpickler):
         description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
         return LazyStorage(load=load, kind=pid[1], description=description)
 
-    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any, # pyright: ignore[reportSelfClsParameterName]
+    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,  # pyright: ignore[reportSelfClsParameterName]
                                requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
         assert isinstance(storage, LazyStorage)
 

From a32f7acc9f54dba1c728cb1e596bd00bf3b4eb5f Mon Sep 17 00:00:00 2001
From: Pavol Rusnak <pavol@rusnak.io>
Date: Fri, 14 Apr 2023 15:37:11 +0200
Subject: [PATCH 25/34] py : cleanup dependencies (#962)

after #545 we do not need torch, tqdm and requests in the dependencies
---
 .devops/full.Dockerfile | 5 +++--
 flake.nix               | 1 -
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile
index a75bc976f..491d67676 100644
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -5,9 +5,10 @@ FROM ubuntu:$UBUNTU_VERSION as build
 RUN apt-get update && \
     apt-get install -y build-essential python3 python3-pip
 
+COPY requirements.txt requirements.txt
+
 RUN pip install --upgrade pip setuptools wheel \
-    && pip install numpy requests sentencepiece tqdm \
-    && pip install torch --index-url https://download.pytorch.org/whl/cpu
+    && pip install -r requirements.txt
 
 WORKDIR /app
 
diff --git a/flake.nix b/flake.nix
index 91d2edd79..5363052b1 100644
--- a/flake.nix
+++ b/flake.nix
@@ -10,7 +10,6 @@
           inherit system;
         };
         llama-python = pkgs.python310.withPackages (ps: with ps; [
-          torch
           numpy
           sentencepiece
         ]);

From c9a59b70a54e0bc05777df287feaea3dbe0310c4 Mon Sep 17 00:00:00 2001
From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Date: Fri, 14 Apr 2023 08:43:55 -0600
Subject: [PATCH 26/34] ggml : add unary and binary map operations (#874)

* GGML map ops proof of concept.

* Various cleanups.

Add handling for task setting.

Add handling for ggml_compute_backward.

Rename functions to ggml_map_unary_f32 and ggml_map_binary_f32

Fix compiler warnings related to casting function pointers and `void *`

Reorder functions and definitions based on the GGML op number.

Use typedefs for map op function pointer types.

* Fix position of map ops cases in ggml_compute_forward
---
 ggml.c | 221 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 ggml.h |  18 +++++
 2 files changed, 237 insertions(+), 2 deletions(-)

diff --git a/ggml.c b/ggml.c
index d99aca21a..ce48b78ad 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2712,9 +2712,12 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
 
     "FLASH_ATTN",
     "FLASH_FF",
+
+    "MAP_UNARY",
+    "MAP_BINARY",
 };
 
-static_assert(GGML_OP_COUNT == 36, "GGML_OP_COUNT != 36");
+static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -2757,9 +2760,12 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
 
     "flash_attn(x)",
     "flash_ff(x)",
+
+    "f(x)",
+    "f(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 36, "GGML_OP_COUNT != 36");
+static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38");
 
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -4907,6 +4913,90 @@ struct ggml_tensor * ggml_flash_ff(
     return result;
 }
 
+// ggml_map_unary
+// builds a GGML_OP_MAP_UNARY node over `a`; the callback `fun` is stored in a small I32 tensor hung off opt[0]
+struct ggml_tensor * ggml_map_unary_impl_f32(
+        struct ggml_context        * ctx,
+        struct ggml_tensor         * a,
+        const  ggml_unary_op_f32_t fun,
+        bool   inplace) {
+    bool is_node = false;
+
+    if (!inplace && a->grad) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); // as many int32 slots as one void * occupies
+    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; // write the callback address into the tensor data; NOTE(review): assumes a function pointer fits in sizeof(void *)
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op = GGML_OP_MAP_UNARY;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; // a grad tensor is only attached when not inplace and `a` has one
+    result->src0 = a;
+    result->opt[0] = addr_tensor; // read back by ggml_compute_forward to recover `fun`
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_unary_f32(
+        struct ggml_context        * ctx,
+        struct ggml_tensor         * a,
+        const  ggml_unary_op_f32_t fun) {
+    return ggml_map_unary_impl_f32(ctx, a, fun, false); // inplace = false: the result tensor comes from ggml_dup_tensor(ctx, a)
+}
+
+struct ggml_tensor * ggml_map_unary_inplace_f32( // NOTE(review): the ggml.h hunk of this patch adds no prototype for this inplace variant
+        struct ggml_context        * ctx,
+        struct ggml_tensor         * a,
+        const  ggml_unary_op_f32_t fun) {
+    return ggml_map_unary_impl_f32(ctx, a, fun, true); // inplace = true: the result tensor comes from ggml_view_tensor(ctx, a)
+}
+
+// ggml_map_binary
+// builds a GGML_OP_MAP_BINARY node over `a` and `b` (same shape required); `fun` is stored in a small I32 tensor at opt[0]
+struct ggml_tensor * ggml_map_binary_impl_f32(
+        struct ggml_context         * ctx,
+        struct ggml_tensor          * a,
+        struct ggml_tensor          * b,
+        const  ggml_binary_op_f32_t fun,
+        bool   inplace) {
+    GGML_ASSERT(ggml_are_same_shape(a, b));
+
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); // as many int32 slots as one void * occupies
+    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; // write the callback address into the tensor data; NOTE(review): assumes a function pointer fits in sizeof(void *)
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op = GGML_OP_MAP_BINARY;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; // a grad tensor is only attached when not inplace and an input has one
+    result->src0 = a;
+    result->src1 = b;
+    result->opt[0] = addr_tensor; // read back by ggml_compute_forward to recover `fun`
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_binary_f32(
+        struct ggml_context         * ctx,
+        struct ggml_tensor          * a,
+        struct ggml_tensor          * b,
+        const  ggml_binary_op_f32_t fun) {
+    return ggml_map_binary_impl_f32(ctx, a, b, fun, false); // inplace = false: the result tensor comes from ggml_dup_tensor(ctx, a)
+}
+
+struct ggml_tensor * ggml_map_binary_inplace_f32( // NOTE(review): the ggml.h hunk of this patch adds no prototype for this inplace variant
+        struct ggml_context         * ctx,
+        struct ggml_tensor          * a,
+        struct ggml_tensor          * b,
+        const  ggml_binary_op_f32_t fun) {
+    return ggml_map_binary_impl_f32(ctx, a, b, fun, true); // inplace = true: the result tensor comes from ggml_view_tensor(ctx, a)
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 void ggml_set_param(
@@ -8875,6 +8965,111 @@ static void ggml_compute_forward_flash_ff(
     }
 }
 
+// ggml_compute_forward_map_unary
+// calls `fun` once per row of src0 (nc elements each), writing the matching row of dst; rows are not split across threads
+static void ggml_compute_forward_map_unary_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst,
+        const ggml_unary_op_f32_t fun) {
+    assert(params->ith == 0); // same guard as the binary variant: the loop below walks every row itself
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+    assert( dst->nb[0] == sizeof(float)); // elements within a row must be densely packed floats
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        fun(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+
+static void ggml_compute_forward_map_unary( // dispatches on src0->type; only the F32 kernel exists
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst,
+        const ggml_unary_op_f32_t fun) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_unary_f32(params, src0, dst, fun);
+            } break;
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_I8:
+        case GGML_TYPE_I16:
+        case GGML_TYPE_I32:
+        case GGML_TYPE_F16:
+        case GGML_TYPE_COUNT:
+            {
+                GGML_ASSERT(false); // every non-F32 source type is rejected
+            } break;
+    }
+}
+
+// ggml_compute_forward_map_binary
+// calls `fun` once per row with the matching rows of src0 and src1 (nc elements each), writing the matching row of dst
+static void ggml_compute_forward_map_binary_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst,
+        const ggml_binary_op_f32_t fun) {
+    assert(params->ith == 0); // the loop below walks every row itself, so only thread 0 may run it
+    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert( dst->nb[0] == sizeof(float)); // elements within a row must be densely packed floats
+    assert(src0->nb[0] == sizeof(float));
+    assert(src1->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        fun(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])),
+                (float *) ((char *) src1->data + i*(src1->nb[1])));
+    }
+}
+
+
+static void ggml_compute_forward_map_binary( // dispatches on src0->type; only the F32 kernel exists
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst,
+        const ggml_binary_op_f32_t fun) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun);
+            } break;
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_I8:
+        case GGML_TYPE_I16:
+        case GGML_TYPE_I32:
+        case GGML_TYPE_F16:
+        case GGML_TYPE_COUNT:
+            {
+                GGML_ASSERT(false); // every non-F32 source type is rejected
+            } break;
+    }
+}
+
 /////////////////////////////////
 
 static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
@@ -9024,6 +9219,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor);
             } break;
+        case GGML_OP_MAP_UNARY:
+            {
+                const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data);
+                ggml_compute_forward_map_unary(params, tensor->src0, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_BINARY:
+            {
+                const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->opt[0]->data);
+                ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun);
+            }
+            break;
         case GGML_OP_NONE:
             {
                 // nop
@@ -9283,6 +9490,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // not supported
             } break;
+        case GGML_OP_MAP_UNARY:
+        case GGML_OP_MAP_BINARY:
+            {
+                GGML_ASSERT(false); // not supported
+            } break;
         case GGML_OP_NONE:
             {
                 // nop
@@ -9775,6 +9987,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
                         work_size = MAX(work_size, cur);
                     } break;
+                case GGML_OP_MAP_UNARY:
+                case GGML_OP_MAP_BINARY:
+                    {
+                        node->n_tasks = 1;
+                    } break;
                 case GGML_OP_NONE:
                     {
                         node->n_tasks = 1;
diff --git a/ggml.h b/ggml.h
index c06c09e06..bdff0b4de 100644
--- a/ggml.h
+++ b/ggml.h
@@ -253,6 +253,9 @@ enum ggml_op {
     GGML_OP_FLASH_ATTN,
     GGML_OP_FLASH_FF,
 
+    GGML_OP_MAP_UNARY,
+    GGML_OP_MAP_BINARY,
+
     GGML_OP_COUNT,
 };
 
@@ -652,6 +655,21 @@ struct ggml_tensor * ggml_flash_ff(
         struct ggml_tensor  * c0,
         struct ggml_tensor  * c1);
 
+// Mapping operations
+typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *); // called per row as (element count, dst row, src row)
+typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *); // called per row as (element count, dst row, src0 row, src1 row)
+
+struct ggml_tensor * ggml_map_unary_f32( // graph node that runs `fun` over the rows of `a`
+        struct ggml_context        * ctx,
+        struct ggml_tensor         * a,
+        const  ggml_unary_op_f32_t fun);
+
+struct ggml_tensor * ggml_map_binary_f32( // graph node that runs `fun` over matching rows of `a` and `b`
+        struct ggml_context         * ctx,
+        struct ggml_tensor          * a,
+        struct ggml_tensor          * b,
+        const  ggml_binary_op_f32_t fun);
+
 //
 // automatic differentiation
 //

From f4d277ae17247ee51129ef1a9ff74d377cc90b1b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Pazdiora?= <tomas.pazdiora@gmail.com>
Date: Fri, 14 Apr 2023 17:19:17 +0200
Subject: [PATCH 27/34] main : alternative instruct mode (Vicuna support, etc.)
 (#863)

* Add support for configs, add configurable prefixes / suffixes, deprecate instruct mode, add stop prompt

* Add multiline mode, update text input.

* bugfix

* update implementation

* typos

* Change --multiline implementation to be toggled by EOF.

* bugfix

* default multiline mode

* add more configs

* update formatting

* update formatting

* apply suggestions
---
 configs/alpaca-native-enhanced.txt |  21 +++
 configs/alpaca.txt                 |   9 +
 configs/chat-with-bob.txt          |  15 ++
 configs/llama.txt                  |   3 +
 configs/vicuna-simple.txt          |   7 +
 configs/vicuna-stop.txt            |   8 +
 configs/vicuna.txt                 |   9 +
 examples/common.cpp                | 284 +++++++++++++++++++++++------
 examples/common.h                  |  30 ++-
 examples/main/main.cpp             | 172 +++++++++++------
 prompts/alpaca.txt                 |   1 -
 prompts/chat-with-bob.txt          |   7 -
 12 files changed, 445 insertions(+), 121 deletions(-)
 create mode 100644 configs/alpaca-native-enhanced.txt
 create mode 100644 configs/alpaca.txt
 create mode 100644 configs/chat-with-bob.txt
 create mode 100644 configs/llama.txt
 create mode 100644 configs/vicuna-simple.txt
 create mode 100644 configs/vicuna-stop.txt
 create mode 100644 configs/vicuna.txt
 delete mode 100644 prompts/alpaca.txt
 delete mode 100644 prompts/chat-with-bob.txt

diff --git a/configs/alpaca-native-enhanced.txt b/configs/alpaca-native-enhanced.txt
new file mode 100644
index 000000000..109d31592
--- /dev/null
+++ b/configs/alpaca-native-enhanced.txt
@@ -0,0 +1,21 @@
+--ctx_size 2048
+--batch_size 16
+--repeat_penalty 1.15
+--temp 0.4
+--top_k 30
+--top_p 0.18
+
+--interactive-first
+--keep -1
+
+--ins-prefix-bos
+--ins-prefix "\n\nUser: "
+--ins-suffix "\n\nAssistant: "
+--reverse-prompt "User: "
+
+-p "You are an AI language model designed to assist the User by answering their questions, offering advice, and engaging in casual conversation in a friendly, helpful, and informative manner. You respond clearly, coherently, and you consider the conversation history.
+
+User: Hey, how's it going?
+
+Assistant: Hey there! I'm doing great, thank you. What can I help you with today? Let's have a fun chat!"
+
diff --git a/configs/alpaca.txt b/configs/alpaca.txt
new file mode 100644
index 000000000..99a3ab47e
--- /dev/null
+++ b/configs/alpaca.txt
@@ -0,0 +1,9 @@
+--clean-interface
+--interactive-first
+--keep -1
+--ins-prefix-bos
+--ins-prefix "\n\n### Instruction:\n\n"
+--ins-suffix "\n\n### Response:\n\n"
+--reverse-prompt "### Instruction:\n\n"
+
+-p "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n"
diff --git a/configs/chat-with-bob.txt b/configs/chat-with-bob.txt
new file mode 100644
index 000000000..0caa749a3
--- /dev/null
+++ b/configs/chat-with-bob.txt
@@ -0,0 +1,15 @@
+--interactive-first
+--keep -1
+--ins-prefix-bos
+--ins-prefix "\nUser: "
+--ins-suffix "\nBob: "
+--reverse-prompt "User: "
+--rm-trailing-space-workaround
+
+-p "Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
+
+User: Hello, Bob.
+Bob: Hello. How may I help you today?
+User: Please tell me the largest city in Europe.
+Bob: Sure. The largest city in Europe is Moscow, the capital of Russia."
+
diff --git a/configs/llama.txt b/configs/llama.txt
new file mode 100644
index 000000000..9d23e75ac
--- /dev/null
+++ b/configs/llama.txt
@@ -0,0 +1,3 @@
+--interactive-first
+--keep -1
+--temp 0.1
diff --git a/configs/vicuna-simple.txt b/configs/vicuna-simple.txt
new file mode 100644
index 000000000..efa60d96a
--- /dev/null
+++ b/configs/vicuna-simple.txt
@@ -0,0 +1,7 @@
+--interactive-first
+--keep -1
+--ins-prefix-bos
+--ins-prefix "\n### Human: "
+--ins-suffix "\n### Assistant: "
+--reverse-prompt "### Human: "
+--rm-trailing-space-workaround
diff --git a/configs/vicuna-stop.txt b/configs/vicuna-stop.txt
new file mode 100644
index 000000000..911d067ef
--- /dev/null
+++ b/configs/vicuna-stop.txt
@@ -0,0 +1,8 @@
+--interactive-first
+--keep -1
+--ins-prefix-bos
+--ins-prefix "\n### Human: "
+--ins-suffix "\n### Assistant: "
+--reverse-prompt "### Human: "
+--stop-prompt "### Assistant: "
+--rm-trailing-space-workaround
diff --git a/configs/vicuna.txt b/configs/vicuna.txt
new file mode 100644
index 000000000..6d811410a
--- /dev/null
+++ b/configs/vicuna.txt
@@ -0,0 +1,9 @@
+--interactive-first
+--keep -1
+--ins-prefix-bos
+--ins-prefix "\n### Human: "
+--ins-suffix "\n### Assistant: "
+--reverse-prompt "### Human: "
+--rm-trailing-space-workaround
+
+-p "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
diff --git a/examples/common.cpp b/examples/common.cpp
index 0772dbfe1..eaa5aceea 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -2,10 +2,13 @@
 
 #include <cassert>
 #include <cstring>
+#include <iostream>
 #include <fstream>
+#include <sstream>
 #include <string>
 #include <iterator>
 #include <algorithm>
+#include <regex>
 
 #if defined (_WIN32)
 #include <fcntl.h>
@@ -23,6 +26,43 @@ extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int
 #define CP_UTF8 65001
 #endif
 
+// Splits args_string on whitespace and appends the pieces to output_args. Text inside '...' or "..." stays
+// together (the other quote kind is kept literally); the quotes themselves are dropped, empty pieces are skipped.
+void split_args(const std::string & args_string, std::vector<std::string> & output_args) {
+    std::string current_arg = "";
+    bool in_quotes = false;
+    char quote_type = '\0'; // quote character that opened the current group; only meaningful while in_quotes
+    for (char c : args_string) {
+        if (c == '"' || c == '\'') {
+            if (!in_quotes) {
+                in_quotes = true;
+                quote_type = c;
+            } else if (quote_type == c) {
+                in_quotes = false;
+            } else {
+                current_arg += c;
+            }
+        } else if (in_quotes) {
+            current_arg += c;
+        } else if (std::isspace(static_cast<unsigned char>(c))) { // cast: isspace is undefined for negative char values (e.g. UTF-8 bytes)
+            if (current_arg != "") {
+                output_args.push_back(current_arg);
+                current_arg = "";
+            }
+        } else {
+            current_arg += c;
+        }
+    }
+
+    if (current_arg != "") {
+        output_args.push_back(current_arg);
+    }
+}
+
+std::string unescape(const std::string & str) { // returns a copy of str with each backslash + 'n' pair turned into a newline
+    return std::regex_replace(str, std::regex("\\\\n"), "\n"); // the regex matches a literal backslash followed by n; no other escape is handled
+}
+
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     // determine sensible default number of threads.
     // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
@@ -40,28 +80,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     std::string arg;
     gpt_params default_params;
 
+    // get additional arguments from config files
+    std::vector<std::string> args;
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
-
-        if (arg == "-s" || arg == "--seed") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.seed = std::stoi(argv[i]);
-        } else if (arg == "-t" || arg == "--threads") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_threads = std::stoi(argv[i]);
-        } else if (arg == "-p" || arg == "--prompt") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.prompt = argv[i];
-        } else if (arg == "-f" || arg == "--file") {
+        if (arg == "--config") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -72,85 +95,153 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
+            std::string args_string;
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(args_string));
+            if (args_string.back() == '\n') {
+                args_string.pop_back();
+            }
+            split_args(args_string, args);
+            for (int j = 0; j < args.size(); j++) {
+                args[j] = unescape(args[j]);
+            }
+        } else {
+            args.emplace_back(argv[i]);
+        }
+    }
+
+    // parse args
+    int args_c = static_cast<int>(args.size());
+    for (int i = 0; i < args_c && !invalid_param; i++) {
+        arg = args[i];
+
+        if (arg == "-s" || arg == "--seed") {
+            if (++i >= args_c) {
+                invalid_param = true;
+                break;
+            }
+            params.seed = std::stoi(args[i]);
+        } else if (arg == "-t" || arg == "--threads") {
+            if (++i >= args_c) {
+                invalid_param = true;
+                break;
+            }
+            params.n_threads = std::stoi(args[i]);
+        } else if (arg == "-p" || arg == "--prompt") {
+            if (++i >= args_c) {
+                invalid_param = true;
+                break;
+            }
+            params.prompt = args[i];
+        } else if (arg == "-f" || arg == "--file") {
+            if (++i >= args_c) {
+                invalid_param = true;
+                break;
+            }
+            std::ifstream file(args[i]);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", args[i].c_str());
+                invalid_param = true;
+                break;
+            }
             std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
             if (params.prompt.back() == '\n') {
                 params.prompt.pop_back();
             }
         } else if (arg == "-n" || arg == "--n_predict") {
-            if (++i >= argc) {
+            if (++i >= args_c) {
                 invalid_param = true;
                 break;
             }
-            params.n_predict = std::stoi(argv[i]);
+            params.n_predict = std::stoi(args[i]);
         } else if (arg == "--top_k") {
-            if (++i >= argc) {
+            if (++i >= args_c) {
                 invalid_param = true;
                 break;
             }
-            params.top_k = std::stoi(argv[i]);
+            params.top_k = std::stoi(args[i]);
         } else if (arg == "-c" || arg == "--ctx_size") {
-            if (++i >= argc) {
+            if (++i >= args_c) {
                 invalid_param = true;
                 break;
             }
-            params.n_ctx = std::stoi(argv[i]);
+            params.n_ctx = std::stoi(args[i]);
         } else if (arg == "--memory_f32") {
             params.memory_f16 = false;
         } else if (arg == "--top_p") {
-            if (++i >= argc) {
+            if (++i >= args_c) {
                 invalid_param = true;
                 break;
             }
-            params.top_p = std::stof(argv[i]);
+            params.top_p = std::stof(args[i]);
         } else if (arg == "--temp") {
-            if (++i >= argc) {
+            if (++i >= args_c) {
                 invalid_param = true;
                 break;
             }
-            params.temp = std::stof(argv[i]);
+            params.temp = std::stof(args[i]);
         } else if (arg == "--repeat_last_n") {
-            if (++i >= argc) {
+            if (++i >= args_c) {
                 invalid_param = true;
                 break;
             }
-            params.repeat_last_n = std::stoi(argv[i]);
+            params.repeat_last_n = std::stoi(args[i]);
         } else if (arg == "--repeat_penalty") {
-            if (++i >= argc) {
+            if (++i >= args_c) {
                 invalid_param = true;
                 break;
             }
-            params.repeat_penalty = std::stof(argv[i]);
+            params.repeat_penalty = std::stof(args[i]);
         } else if (arg == "-b" || arg == "--batch_size") {
-            if (++i >= argc) {
+            if (++i >= args_c) {
                 invalid_param = true;
                 break;
             }
-            params.n_batch = std::stoi(argv[i]);
+            params.n_batch = std::stoi(args[i]);
             params.n_batch = std::min(512, params.n_batch);
         } else if (arg == "--keep") {
-            if (++i >= argc) {
+            if (++i >= args_c) {
                 invalid_param = true;
                 break;
             }
-            params.n_keep = std::stoi(argv[i]);
+            params.n_keep = std::stoi(args[i]);
         } else if (arg == "-m" || arg == "--model") {
-            if (++i >= argc) {
+            if (++i >= args_c) {
                 invalid_param = true;
                 break;
             }
-            params.model = argv[i];
+            params.model = args[i];
         } else if (arg == "-i" || arg == "--interactive") {
             params.interactive = true;
         } else if (arg == "--embedding") {
             params.embedding = true;
+        } else if (arg == "--clean-interface") {
+            params.clean_interface = true;
         } else if (arg == "--interactive-start") {
             params.interactive = true;
         } else if (arg == "--interactive-first") {
             params.interactive_start = true;
         } else if (arg == "-ins" || arg == "--instruct") {
-            params.instruct = true;
+            fprintf(stderr, "\n\nWarning: instruct mode is deprecated! Use: \n"
+                "--clean-interface "
+                "--interactive-first "
+                "--keep -1 "
+                "--ins-prefix-bos "
+                "--ins-prefix \"\\n\\n### Instruction:\\n\\n\" "
+                "--ins-suffix \"\\n\\n### Response:\\n\\n\" "
+                "-r \"### Instruction:\\n\\n\" "
+            "\n\n");
+            // params.instruct = true;
+            params.clean_interface = true;
+            params.interactive_start = true;
+            params.n_keep = -1;
+            params.instruct_prefix_bos = true;
+            params.instruct_prefix = "\n\n### Instruction:\n\n";
+            params.instruct_suffix = "\n\n### Response:\n\n";
+            params.antiprompt.push_back("### Instruction:\n\n");
         } else if (arg == "--color") {
             params.use_color = true;
+        } else if (arg == "--disable-multiline") {
+            params.multiline_mode = false;
         } else if (arg == "--mlock") {
             params.use_mlock = true;
         } else if (arg == "--no-mmap") {
@@ -160,65 +251,94 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "--verbose-prompt") {
             params.verbose_prompt = true;
         } else if (arg == "-r" || arg == "--reverse-prompt") {
-            if (++i >= argc) {
+            if (++i >= args_c) {
                 invalid_param = true;
                 break;
             }
-            params.antiprompt.push_back(argv[i]);
+            params.antiprompt.push_back(args[i]);
+        } else if (arg == "--stop-prompt") {
+            if (++i >= args_c) {
+                invalid_param = true;
+                break;
+            }
+            params.stopprompt.push_back(args[i]);
+        } else if (arg == "--rm-trailing-space-workaround") {
+            params.rm_trailing_space_workaround = true;
         } else if (arg == "--perplexity") {
             params.perplexity = true;
         } else if (arg == "--ignore-eos") {
             params.ignore_eos = true;
         } else if (arg == "--n_parts") {
-            if (++i >= argc) {
+            if (++i >= args_c) {
                 invalid_param = true;
                 break;
             }
-            params.n_parts = std::stoi(argv[i]);
+            params.n_parts = std::stoi(args[i]);
         } else if (arg == "-h" || arg == "--help") {
-            gpt_print_usage(argc, argv, default_params);
+            gpt_print_usage(argv[0], default_params);
             exit(0);
         } else if (arg == "--random-prompt") {
             params.random_prompt = true;
         } else if (arg == "--in-prefix") {
-            if (++i >= argc) {
+            if (++i >= args_c) {
                 invalid_param = true;
                 break;
             }
-            params.input_prefix = argv[i];
+            params.input_prefix = args[i];
+        } else if (arg == "--ins-prefix-bos") {
+            params.instruct_prefix_bos = true;
+        } else if (arg == "--ins-prefix") {
+            if (++i >= args_c) {
+                invalid_param = true;
+                break;
+            }
+            params.instruct_prefix = args[i];
+        } else if (arg == "--ins-suffix-bos") {
+            params.instruct_suffix_bos = true;
+        } else if (arg == "--ins-suffix") {
+            if (++i >= args_c) {
+                invalid_param = true;
+                break;
+            }
+            params.instruct_suffix = args[i];
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            gpt_print_usage(argc, argv, default_params);
+            gpt_print_usage(argv[0], default_params);
             exit(1);
         }
     }
     if (invalid_param) {
         fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        gpt_print_usage(argc, argv, default_params);
+        gpt_print_usage(argv[0], default_params);
         exit(1);
     }
 
     return true;
 }
 
-void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+void gpt_print_usage(char * argv_0, const gpt_params & params) {
+    fprintf(stderr, "usage: %s [options]\n", argv_0);
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
     fprintf(stderr, "  -h, --help            show this help message and exit\n");
     fprintf(stderr, "  -i, --interactive     run in interactive mode\n");
     fprintf(stderr, "  --interactive-first   run in interactive mode and wait for input right away\n");
-    fprintf(stderr, "  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
+    fprintf(stderr, "  --clean-interface     hides input prefix & suffix and displays '>' instead\n");
     fprintf(stderr, "  -r PROMPT, --reverse-prompt PROMPT\n");
     fprintf(stderr, "                        run in interactive mode and poll user input upon seeing PROMPT (can be\n");
     fprintf(stderr, "                        specified more than once for multiple prompts).\n");
     fprintf(stderr, "  --color               colorise output to distinguish prompt and user input from generations\n");
+    fprintf(stderr, "  --disable-multiline   disable multiline mode (use Ctrl+D on Linux/Mac and Ctrl+Z then Return on Windows to toggle multiline)\n");
     fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for <= 0)\n");
     fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
     fprintf(stderr, "                        prompt to start generation with (default: empty)\n");
     fprintf(stderr, "  --random-prompt       start with a randomized prompt.\n");
     fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
+    fprintf(stderr, "  --ins-prefix STRING   (instruct) prefix user inputs with tokenized string (default: empty)\n");
+    fprintf(stderr, "  --ins-prefix-bos      (instruct) prepend bos token to instruct prefix.\n");
+    fprintf(stderr, "  --ins-suffix STRING   (instruct) suffix user inputs with tokenized string (default: empty)\n");
+    fprintf(stderr, "  --ins-suffix-bos      (instruct) prepend bos token to instruct suffix.\n");
     fprintf(stderr, "  -f FNAME, --file FNAME\n");
     fprintf(stderr, "                        prompt file to start generation.\n");
     fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
@@ -328,3 +448,61 @@ void win32_utf8_encode(const std::wstring & wstr, std::string & str) {
     str = strTo;
 }
 #endif
+
+// Reads one user input (one or more lines) from stdin and appends it to input_text.
+//   eof_toggled_multiline_mode == false: a line ending in '\' continues on the next line,
+//       any other line (or EOF) ends the input.
+//   eof_toggled_multiline_mode == true: input is a single line unless EOF (Ctrl+D, or Ctrl+Z
+//       then Return on Windows) is sent to start collecting lines; a second EOF ends the input.
+// Returns false only when the input stream is bad, true otherwise.
+bool get_input_text(std::string & input_text, bool eof_toggled_multiline_mode) {
+    bool another_line = true;
+    bool is_eof_multiline_toggled = false;
+    do {
+        std::string line;
+#if defined(_WIN32)
+        auto & stdcin = std::wcin;
+        std::wstring wline;
+        // a failed read is only fatal when the stream is bad; EOF is handled below
+        if (!std::getline(stdcin, wline) && stdcin.bad()) {
+            fprintf(stderr, "%s: error: input stream bad\n", __func__);
+            return false;
+        }
+        win32_utf8_encode(wline, line);
+#else
+        auto & stdcin = std::cin;
+        // a failed read is only fatal when the stream is bad; EOF is handled below
+        if (!std::getline(stdcin, line) && stdcin.bad()) {
+            fprintf(stderr, "%s: error: input stream bad\n", __func__);
+            return false;
+        }
+#endif
+        if (stdcin.eof()) {
+            stdcin.clear(); // drop the EOF state so the stream can be read again
+            stdcin.seekg(0, std::ios::beg); // NOTE(review): may set failbit on a non-seekable stdin - confirm
+            if (!eof_toggled_multiline_mode) {
+                another_line = false;
+            } else {
+                is_eof_multiline_toggled = !is_eof_multiline_toggled;
+                if (is_eof_multiline_toggled) {
+                    input_text += line;
+                    continue;
+                }
+            }
+        }
+        if (!eof_toggled_multiline_mode) {
+            if (line.empty() || line.back() != '\\') {
+                another_line = false;
+            } else {
+                line.pop_back(); // Remove the continue character
+            }
+        } else if (!is_eof_multiline_toggled) {
+            another_line = false;
+        }
+        input_text += line;
+        if (another_line) {
+            input_text += '\n'; // keep the line break between collected lines
+        }
+    } while (another_line);
+    return true;
+}
diff --git a/examples/common.h b/examples/common.h
index 1ea6f7445..df8e4c6cc 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -14,14 +14,14 @@
 //
 
 struct gpt_params {
-    int32_t seed          = -1;   // RNG seed
-    int32_t n_threads     = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_predict     = 128;  // new tokens to predict
-    int32_t repeat_last_n = 64;   // last n tokens to penalize
-    int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
-    int32_t n_ctx         = 512;  // context size
-    int32_t n_batch       = 8;    // batch size for prompt processing
-    int32_t n_keep        = 0;    // number of tokens to keep from initial prompt
+    int32_t seed          = -1;    // RNG seed
+    int32_t n_threads     = std::min(4, (int32_t) std::thread::hardware_concurrency()); // max 4 threads (default)
+    int32_t n_predict     = 128;   // new tokens to predict
+    int32_t repeat_last_n = 64;    // last n tokens to penalize
+    int32_t n_parts       = -1;    // amount of model parts (-1 = determine from model dimensions)
+    int32_t n_ctx         = 512;   // context size
+    int32_t n_batch       = 8;     // batch size for prompt processing
+    int32_t n_keep        = 0;     // number of tokens to keep from initial prompt (-1 for all)
 
     // sampling parameters
     int32_t top_k = 40;
@@ -33,8 +33,15 @@ struct gpt_params {
     std::string prompt = "";
     std::string input_prefix = ""; // string to prefix user inputs with
 
+    std::string instruct_prefix = ""; // prefix user inputs with tokenized string
+    bool instruct_prefix_bos = false; // prepend bos token to instruct prefix
+    std::string instruct_suffix = ""; // suffix user inputs with tokenized string
+    bool instruct_suffix_bos = false; // prepend bos token to instruct suffix
 
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
+    std::vector<std::string> stopprompt; // string upon seeing which more user input is prompted (without adding instruct prefixes and suffixes)
+
+    bool rm_trailing_space_workaround = false; // workaround for removing trailing space from reverse/stop prompts
 
     bool memory_f16        = true;  // use f16 instead of f32 for memory kv
     bool random_prompt     = false; // do not randomize prompt if none provided
@@ -51,11 +58,14 @@ struct gpt_params {
     bool use_mlock         = false; // use mlock to keep model in memory
     bool mem_test          = false; // compute maximum memory usage
     bool verbose_prompt    = false; // print prompt tokens before generation
+
+    bool clean_interface   = false; // hides input prefix & suffix and displays '>'
+    bool multiline_mode    = true; // enables multi-line mode, to send input press CTRL+D on Linux/Mac, Ctrl+Z then Return on Windows
 };
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
 
-void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
+void gpt_print_usage(char * argv_0, const gpt_params & params);
 
 std::string gpt_random_prompt(std::mt19937 & rng);
 
@@ -95,3 +105,5 @@ void set_console_color(console_state & con_st, console_color_t color);
 void win32_console_init(bool enable_color);
 void win32_utf8_encode(const std::wstring & wstr, std::string & str);
 #endif
+
+bool get_input_text(std::string & input_text, bool eof_toggled_multiline_mode); // appends user input read from stdin to input_text
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index ba153cb82..68b4b2840 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -30,7 +30,8 @@ static bool is_interacting = false;
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 void sigint_handler(int signo) {
     set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
-    printf("\n"); // this also force flush stdout.
+    fflush(stdout);
+    fflush(stderr);
     if (signo == SIGINT) {
         if (!is_interacting) {
             is_interacting=true;
@@ -89,6 +90,8 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
+    bool instruct_mode = !params.instruct_prefix.empty() || !params.instruct_suffix.empty();
+
 //    params.prompt = R"(// this function checks if the number n is prime
 //bool is_prime(int n) {)";
 
@@ -153,22 +156,20 @@ int main(int argc, char ** argv) {
     }
 
     // number of tokens to keep when resetting context
-    if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) {
+    if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size()) {
         params.n_keep = (int)embd_inp.size();
     }
 
     // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
-    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
-
-    // in instruct mode, we inject a prefix and a suffix to each input by the user
-    if (params.instruct) {
-        params.interactive_start = true;
-        params.antiprompt.push_back("### Instruction:\n\n");
+    const auto inp_pfx = ::llama_tokenize(ctx, params.instruct_prefix, params.instruct_prefix_bos);
+    std::string instruct_suffix = params.instruct_suffix; // local copy so a trailing space can be stripped before tokenizing
+    if (params.rm_trailing_space_workaround) {
+        if (!instruct_suffix.empty() && instruct_suffix.back() == ' ') { instruct_suffix.pop_back(); }
     }
+    const auto inp_sfx = ::llama_tokenize(ctx, instruct_suffix, params.instruct_suffix_bos);
 
     // enable interactive mode if reverse prompt or interactive start is specified
-    if (params.antiprompt.size() != 0 || params.interactive_start) {
+    if (params.antiprompt.size() != 0 || params.stopprompt.size() != 0 || params.interactive_start) {
         params.interactive = true;
     }
 
@@ -210,10 +211,21 @@ int main(int argc, char ** argv) {
                 fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
             }
         }
+        if (params.stopprompt.size()) {
+            for (auto stopprompt : params.stopprompt) {
+                fprintf(stderr, "Stop prompt: '%s'\n", stopprompt.c_str());
+            }
+        }
 
         if (!params.input_prefix.empty()) {
             fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
         }
+        if (!params.instruct_prefix.empty()) {
+            fprintf(stderr, "Instruct prefix %s: '%s'\n", params.instruct_prefix_bos ? "(with bos token)" : "", params.instruct_prefix.c_str());
+        }
+        if (!params.instruct_suffix.empty()) {
+            fprintf(stderr, "Instruct suffix %s: '%s'\n", params.instruct_suffix_bos ? "(with bos token)" : "", params.instruct_suffix.c_str());
+        }
     }
     fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n",
         params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
@@ -229,12 +241,29 @@ int main(int argc, char ** argv) {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
                " - Press Ctrl+C to interject at any time.\n"
 #endif
-               " - Press Return to return control to LLaMa.\n"
-               " - If you want to submit another line, end your input in '\\'.\n\n");
+        );
+        if (params.multiline_mode) {
+            fprintf(stderr, " - Press Return to return control to LLaMa.\n"
+#if defined (_WIN32)
+                            " - [MULTILINE MODE] Press Ctrl+Z then Return (EOF) to toggle.\n\n");
+#else
+                            " - [MULTILINE MODE] Press Ctrl+D (EOF) to toggle.\n\n");
+#endif
+        }
+        else {
+            fprintf(stderr, " - Press Return to return control to LLaMa.\n"
+                            " - If you want to submit another line, end your input in '\\'.\n\n");
+        }
         is_interacting = params.interactive_start;
     }
 
-    bool is_antiprompt = false;
+    struct Antiprompt { // result of the most recent reverse/stop prompt check
+        bool any = false; // a reverse prompt or a stop prompt matched the end of the output
+        bool trailing_space = false; // the last examined prompt ended in ' ' (tracked for the trailing-space workaround)
+        size_t len; // number of prompt characters compared against the output tail; note: no default initializer
+        bool is_stop_prompt = false; // the match came from params.stopprompt rather than params.antiprompt
+    } antiprompt;
+
     bool input_noecho  = false;
 
     int n_past     = 0;
@@ -304,7 +333,7 @@ int main(int argc, char ** argv) {
             }
 
             // replace end of text token with newline token when in interactive mode
-            if (id == llama_token_eos() && params.interactive && !params.instruct) {
+            if (id == llama_token_eos() && params.interactive && !instruct_mode) {
                 id = llama_token_newline.front();
                 if (params.antiprompt.size() != 0) {
                     // tokenize and inject first reverse prompt
@@ -350,27 +379,72 @@ int main(int argc, char ** argv) {
         // check if we should prompt the user for more
         if (params.interactive && (int) embd_inp.size() <= n_consumed) {
 
-            // check for reverse prompt
-            if (params.antiprompt.size()) {
+            // check for reverse prompt or stop prompt
+            if (params.antiprompt.size() || params.stopprompt.size()) {
                 std::string last_output;
                 for (auto id : last_n_tokens) {
                     last_output += llama_token_to_str(ctx, id);
                 }
 
-                is_antiprompt = false;
+                antiprompt.any = false;
+                antiprompt.is_stop_prompt = false;
                 // Check if each of the reverse prompts appears at the end of the output.
-                for (std::string & antiprompt : params.antiprompt) {
-                    if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
+                for (std::string & prompt : params.antiprompt) {
+                    // Always compute the compared length: antiprompt.len has no default value, so it must not
+                    // depend on the workaround flag. A trailing space is only dropped when the flag is set.
+                    antiprompt.trailing_space = params.rm_trailing_space_workaround && !prompt.empty() && prompt.back() == ' ';
+                    antiprompt.len = prompt.length() - (antiprompt.trailing_space ? 1 : 0);
+                    if (last_output.find(prompt.c_str(), last_output.length() - antiprompt.len, antiprompt.len) != std::string::npos) {
                         is_interacting = true;
-                        is_antiprompt = true;
+                        antiprompt.any = true;
                         set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
                         fflush(stdout);
                         break;
                     }
                 }
+                if (!antiprompt.any) {
+                    for (std::string & prompt : params.stopprompt) {
+                        // Same length computation as for the reverse prompts: never leave antiprompt.len
+                        // unset, and never call back() on an empty prompt.
+                        antiprompt.trailing_space = params.rm_trailing_space_workaround && !prompt.empty() && prompt.back() == ' ';
+                        antiprompt.len = prompt.length() - (antiprompt.trailing_space ? 1 : 0);
+                        if (last_output.find(prompt.c_str(), last_output.length() - antiprompt.len, antiprompt.len) != std::string::npos) {
+                            is_interacting = true;
+                            antiprompt.any = true;
+                            antiprompt.is_stop_prompt = true;
+                            set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
+                            fflush(stdout);
+                            break;
+                        }
+                    }
+                }
             }
 
-            if (n_past > 0 && is_interacting) {
+            if (n_past > 0 && is_interacting)
+            {
+                std::string buffer;
+                if (!params.clean_interface && !params.instruct_prefix.empty() && !antiprompt.any) {
+                    // avoid printing again user's new line (TODO: try to revert enter press and print newline)
+                    int i = params.instruct_prefix.front() == '\n' ? 1 : 0;
+                    for (; i < inp_pfx.size(); i++) {
+                        printf("%s", llama_token_to_str(ctx, inp_pfx[i]));
+                    }
+                    fflush(stdout);
+                }
+                if (params.rm_trailing_space_workaround) {
+                    // add only if not stopprompt (as stopprompt could be used to pause
+                        //     assistant and then continue without input - adding back trailing
+                        //     space may mess it up.)
+                    if (!antiprompt.is_stop_prompt && antiprompt.any && antiprompt.trailing_space) {
+                        // add back removed trailing space to buffer(workaround)
+                        buffer += ' ';
+                        if (!params.clean_interface) {
+                            printf("%s", buffer.c_str());
+                        }
+                        fflush(stdout);
+                    }
+                }
+
                 // potentially set color to indicate we are taking user input
                 set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
 
@@ -379,49 +453,45 @@ int main(int argc, char ** argv) {
                 signal(SIGINT, sigint_handler);
 #endif
 
-                if (params.instruct) {
+                if (params.clean_interface) {
                     printf("\n> ");
                 }
 
-                std::string buffer;
                 if (!params.input_prefix.empty()) {
                     buffer += params.input_prefix;
                     printf("%s", buffer.c_str());
                 }
 
-                std::string line;
-                bool another_line = true;
-                do {
-#if defined(_WIN32)
-                    std::wstring wline;
-                    if (!std::getline(std::wcin, wline)) {
-                        // input stream is bad or EOF received
-                        return 0;
-                    }
-                    win32_utf8_encode(wline, line);
-#else
-                    if (!std::getline(std::cin, line)) {
-                        // input stream is bad or EOF received
-                        return 0;
-                    }
-#endif
-                    if (line.empty() || line.back() != '\\') {
-                        another_line = false;
-                    } else {
-                        line.pop_back(); // Remove the continue character
-                    }
-                    buffer += line + '\n'; // Append the line to the result
-                } while (another_line);
+                if (!get_input_text(buffer, params.multiline_mode)) {
+                    // input stream is bad
+                    return 1;
+                }
+                if (!antiprompt.is_stop_prompt) {
+                    buffer += "\n";
+                }
 
                 // done taking input, reset color
                 set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
 
+                if (!params.clean_interface && !params.instruct_suffix.empty() && !antiprompt.is_stop_prompt) {
+                    // avoid printing again user's new line (TODO: try to revert enter press and print newline)
+                    int i = params.instruct_suffix.front() == '\n' ? 1 : 0;
+                    for (; i < inp_sfx.size(); i++) {
+                        printf("%s", llama_token_to_str(ctx, inp_sfx[i]));
+                    }
+                    // if (remove trailing space workaround) {
+                    //     We won't add back removed trailing space here, because assistant continues here,
+                    //         and it may mess up its output (remove trailing space workaround).
+                    // }
+                    fflush(stdout);
+                }
+
                 // Add tokens to embd only if the input buffer is non-empty
                 // Entering a empty line lets the user pass control back
                 if (buffer.length() > 1) {
 
-                    // instruct mode: insert instruction prefix
-                    if (params.instruct && !is_antiprompt) {
+                    // insert input prefix
+                    if (!params.instruct_prefix.empty() && !antiprompt.any) {
                         n_consumed = embd_inp.size();
                         embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
                     }
@@ -429,8 +499,8 @@ int main(int argc, char ** argv) {
                     auto line_inp = ::llama_tokenize(ctx, buffer, false);
                     embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
 
-                    // instruct mode: insert response suffix
-                    if (params.instruct) {
+                    // insert response suffix
+                    if (!params.instruct_suffix.empty() && !antiprompt.is_stop_prompt) {
                         embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
                     }
 
@@ -447,7 +517,7 @@ int main(int argc, char ** argv) {
 
         // end of text token
         if (!embd.empty() && embd.back() == llama_token_eos()) {
-            if (params.instruct) {
+            if (instruct_mode) {
                 is_interacting = true;
             } else {
                 fprintf(stderr, " [end of text]\n");
diff --git a/prompts/alpaca.txt b/prompts/alpaca.txt
deleted file mode 100644
index 2224bdeb0..000000000
--- a/prompts/alpaca.txt
+++ /dev/null
@@ -1 +0,0 @@
-Below is an instruction that describes a task. Write a response that appropriately completes the request.
diff --git a/prompts/chat-with-bob.txt b/prompts/chat-with-bob.txt
deleted file mode 100644
index ad494d831..000000000
--- a/prompts/chat-with-bob.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
-
-User: Hello, Bob.
-Bob: Hello. How may I help you today?
-User: Please tell me the largest city in Europe.
-Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
-User:
\ No newline at end of file

From c56b7152690ca25cfd66b20210b3629e6c1e739b Mon Sep 17 00:00:00 2001
From: Pavol Rusnak <pavol@rusnak.io>
Date: Fri, 14 Apr 2023 20:05:37 +0200
Subject: [PATCH 28/34] Expose type name from ggml (#970)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Avoid duplication of type names in utils

Co-authored-by: Håkon H. Hitland <haakon@likedan.net>
---
 examples/quantize-stats/quantize-stats.cpp | 14 ++++++--------
 ggml.c                                     | 17 +++++++++++++++++
 ggml.h                                     |  2 ++
 llama.cpp                                  | 14 ++------------
 4 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index c786fe208..050300931 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -16,9 +16,6 @@
 #include <unordered_map>
 #include <vector>
 
-static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32"  };
-static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");
-
 struct quantize_stats_params {
     std::string model = "models/7B/ggml-model-f16.bin";
     bool verbose = false;
@@ -224,7 +221,7 @@ int main(int argc, char ** argv) {
                 break;
             }
             int j;
-            for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], type_strs[j]) != 0; j++) {
+            for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) j)) != 0; j++) {
                 // find match
             }
             if (j < GGML_TYPE_COUNT) {
@@ -279,7 +276,7 @@ int main(int argc, char ** argv) {
             continue;
         }
         if (params.verbose) {
-            printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), type_strs[kv_tensor.second->type], ggml_nelements(kv_tensor.second));
+            printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), ggml_type_name(kv_tensor.second->type), ggml_nelements(kv_tensor.second));
         }
         if (kv_tensor.second->type == GGML_TYPE_F16) {
             is_f16 = true;
@@ -304,13 +301,14 @@ int main(int argc, char ** argv) {
 
     // loop throught quantization types
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+        const ggml_type type = (ggml_type) i;
         if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
             continue;
         }
         quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
         if (qfns.quantize_row_q && qfns.dequantize_row_q) {
             if (params.verbose) {
-                printf("testing %s ...\n",  type_strs[i]);
+                printf("testing %s ...\n",  ggml_type_name(type));
             }
 
             error_stats global_stats {};
@@ -322,7 +320,7 @@ int main(int argc, char ** argv) {
                 if (params.verbose) {
                     printf("  %s ...\n",  kv_tensor.first.c_str());
                 }
-                std::string layer_name { type_strs[i] };
+                std::string layer_name { ggml_type_name(type) };
                 layer_name += "::" + kv_tensor.first;
                 test_roundtrip_on_layer(
                         layer_name,
@@ -337,7 +335,7 @@ int main(int argc, char ** argv) {
                 );
             }
 
-            print_error_stats(type_strs[i], global_stats, params.print_histogram);
+            print_error_stats(ggml_type_name(type), global_stats, params.print_histogram);
         }
     }
 
diff --git a/ggml.c b/ggml.c
index ce48b78ad..1574d6498 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2671,6 +2671,18 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
 };
 static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_SIZE is outdated");
 
+// printable name of each ggml type, indexed by enum ggml_type value
+static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
+    [GGML_TYPE_F32]  = "f32",
+    [GGML_TYPE_F16]  = "f16",
+    [GGML_TYPE_Q4_0] = "q4_0",
+    [GGML_TYPE_Q4_1] = "q4_1",
+    [GGML_TYPE_I8]   = "i8",
+    [GGML_TYPE_I16]  = "i16",
+    [GGML_TYPE_I32]  = "i32",
+};
+static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_NAME is outdated");
+
 static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
     "NONE",
 
@@ -2895,6 +2907,11 @@ float ggml_type_sizef(enum ggml_type type) {
     return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type];
 }
 
+// returns the GGML_TYPE_NAME entry for the given type (the index is not range-checked here)
+const char * ggml_type_name(enum ggml_type type) {
+    return GGML_TYPE_NAME[type];
+}
+
 size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return GGML_TYPE_SIZE[tensor->type];
 }
diff --git a/ggml.h b/ggml.h
index bdff0b4de..617298a95 100644
--- a/ggml.h
+++ b/ggml.h
@@ -354,6 +354,8 @@ int    ggml_blck_size (enum ggml_type type);
 size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
 float  ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
 
+const char * ggml_type_name(enum ggml_type type); // printable name of the type, e.g. "f32" or "q4_0"
+
 size_t ggml_element_size(const struct ggml_tensor * tensor);
 
 struct ggml_context * ggml_init(struct ggml_init_params params);
diff --git a/llama.cpp b/llama.cpp
index c72295684..be8c4cdc1 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -269,16 +269,6 @@ static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
     return ret;
 }
 
-static const char * llama_format_type(enum ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_F32: return "f32";
-        case GGML_TYPE_F16: return "f16";
-        case GGML_TYPE_Q4_0: return "q4_0";
-        case GGML_TYPE_Q4_1: return "q4_1";
-        default: LLAMA_ASSERT(false);
-    }
-}
-
 static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
     size_t size = ggml_type_size(type);
     for (uint32_t dim : ne) {
@@ -1582,7 +1572,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         printf("[%zu/%zu] %36s - %s, type = %6s, ",
                ++idx, model_loader->tensors_map.tensors.size(),
                tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
-               llama_format_type(tensor.type));
+               ggml_type_name(tensor.type));
 
         // This used to be a regex, but <regex> has an extreme cost to compile times.
         bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
@@ -1615,7 +1605,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                     f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
                 }
             } else {
-                throw format("type %s unsupported for integer quantization", llama_format_type(tensor.type));
+                throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
             }
 
             printf("quantizing .. ");

From 93265e988af32b8be314bfed334f795a3037555d Mon Sep 17 00:00:00 2001
From: Stephan Walter <stephan@walter.name>
Date: Fri, 14 Apr 2023 19:39:48 +0000
Subject: [PATCH 29/34] make : fix dependencies, use auto variables (#983)

---
 Makefile | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/Makefile b/Makefile
index 7db246650..a1b99c6f9 100644
--- a/Makefile
+++ b/Makefile
@@ -140,44 +140,44 @@ default: main quantize perplexity embedding
 #
 
 ggml.o: ggml.c ggml.h
-	$(CC)  $(CFLAGS)   -c ggml.c -o ggml.o
+	$(CC)  $(CFLAGS)   -c $< -o $@
 
-llama.o: llama.cpp llama.h llama_util.h
-	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
+llama.o: llama.cpp ggml.h llama.h llama_util.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 common.o: examples/common.cpp examples/common.h
-	$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 clean:
 	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult
 
 main: examples/main/main.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 	@echo
 	@echo '====  Run ./main -h for help.  ===='
 	@echo
 
 quantize: examples/quantize/quantize.cpp ggml.o llama.o
-	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
 quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
-	$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
 perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
 embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
 libllama.so: llama.o ggml.o
-	$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
 #
 # Tests
 #
 
-benchmark: ggml.o
-	$(CXX) $(CXXFLAGS) examples/benchmark/benchmark-q4_0-matmult.c ggml.o -o benchmark-q4_0-matmult $(LDFLAGS)
+benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o
+	$(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS)
 	./benchmark-q4_0-matmult
 
 .PHONY: tests

From 489093548c89c67520109ab25c4df4a4614a32a0 Mon Sep 17 00:00:00 2001
From: Pavol Rusnak <pavol@rusnak.io>
Date: Fri, 14 Apr 2023 21:46:49 +0200
Subject: [PATCH 30/34] py : bump sentencepiece to 0.1.98 to support Python
 3.11 (#976)

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index f3944951a..6c32cbd04 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
 numpy==1.24
-sentencepiece==0.1.97
+sentencepiece==0.1.98

From c85e03d12e4b8af22cb13aa9c618dcd5935862fd Mon Sep 17 00:00:00 2001
From: Pavol Rusnak <pavol@rusnak.io>
Date: Fri, 14 Apr 2023 21:58:43 +0200
Subject: [PATCH 31/34] Revert "main : alternative instruct mode (Vicuna
 support, etc.) (#863)" (#982)

This reverts commit f4d277ae17247ee51129ef1a9ff74d377cc90b1b.
---
 configs/alpaca-native-enhanced.txt |  21 ---
 configs/alpaca.txt                 |   9 -
 configs/chat-with-bob.txt          |  15 --
 configs/llama.txt                  |   3 -
 configs/vicuna-simple.txt          |   7 -
 configs/vicuna-stop.txt            |   8 -
 configs/vicuna.txt                 |   9 -
 examples/common.cpp                | 284 ++++++-----------------------
 examples/common.h                  |  30 +--
 examples/main/main.cpp             | 172 ++++++-----------
 prompts/alpaca.txt                 |   1 +
 prompts/chat-with-bob.txt          |   7 +
 12 files changed, 121 insertions(+), 445 deletions(-)
 delete mode 100644 configs/alpaca-native-enhanced.txt
 delete mode 100644 configs/alpaca.txt
 delete mode 100644 configs/chat-with-bob.txt
 delete mode 100644 configs/llama.txt
 delete mode 100644 configs/vicuna-simple.txt
 delete mode 100644 configs/vicuna-stop.txt
 delete mode 100644 configs/vicuna.txt
 create mode 100644 prompts/alpaca.txt
 create mode 100644 prompts/chat-with-bob.txt

diff --git a/configs/alpaca-native-enhanced.txt b/configs/alpaca-native-enhanced.txt
deleted file mode 100644
index 109d31592..000000000
--- a/configs/alpaca-native-enhanced.txt
+++ /dev/null
@@ -1,21 +0,0 @@
---ctx_size 2048
---batch_size 16
---repeat_penalty 1.15
---temp 0.4
---top_k 30
---top_p 0.18
-
---interactive-first
---keep -1
-
---ins-prefix-bos
---ins-prefix "\n\nUser: "
---ins-suffix "\n\nAssistant: "
---reverse-prompt "User: "
-
--p "You are an AI language model designed to assist the User by answering their questions, offering advice, and engaging in casual conversation in a friendly, helpful, and informative manner. You respond clearly, coherently, and you consider the conversation history.
-
-User: Hey, how's it going?
-
-Assistant: Hey there! I'm doing great, thank you. What can I help you with today? Let's have a fun chat!"
-
diff --git a/configs/alpaca.txt b/configs/alpaca.txt
deleted file mode 100644
index 99a3ab47e..000000000
--- a/configs/alpaca.txt
+++ /dev/null
@@ -1,9 +0,0 @@
---clean-interface
---interactive-first
---keep -1
---ins-prefix-bos
---ins-prefix "\n\n### Instruction:\n\n"
---ins-suffix "\n\n### Response:\n\n"
---reverse-prompt "### Instruction:\n\n"
-
--p "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n"
diff --git a/configs/chat-with-bob.txt b/configs/chat-with-bob.txt
deleted file mode 100644
index 0caa749a3..000000000
--- a/configs/chat-with-bob.txt
+++ /dev/null
@@ -1,15 +0,0 @@
---interactive-first
---keep -1
---ins-prefix-bos
---ins-prefix "\nUser: "
---ins-suffix "\nBob: "
---reverse-prompt "User: "
---rm-trailing-space-workaround
-
--p "Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
-
-User: Hello, Bob.
-Bob: Hello. How may I help you today?
-User: Please tell me the largest city in Europe.
-Bob: Sure. The largest city in Europe is Moscow, the capital of Russia."
-
diff --git a/configs/llama.txt b/configs/llama.txt
deleted file mode 100644
index 9d23e75ac..000000000
--- a/configs/llama.txt
+++ /dev/null
@@ -1,3 +0,0 @@
---interactive-first
---keep -1
---temp 0.1
diff --git a/configs/vicuna-simple.txt b/configs/vicuna-simple.txt
deleted file mode 100644
index efa60d96a..000000000
--- a/configs/vicuna-simple.txt
+++ /dev/null
@@ -1,7 +0,0 @@
---interactive-first
---keep -1
---ins-prefix-bos
---ins-prefix "\n### Human: "
---ins-suffix "\n### Assistant: "
---reverse-prompt "### Human: "
---rm-trailing-space-workaround
diff --git a/configs/vicuna-stop.txt b/configs/vicuna-stop.txt
deleted file mode 100644
index 911d067ef..000000000
--- a/configs/vicuna-stop.txt
+++ /dev/null
@@ -1,8 +0,0 @@
---interactive-first
---keep -1
---ins-prefix-bos
---ins-prefix "\n### Human: "
---ins-suffix "\n### Assistant: "
---reverse-prompt "### Human: "
---stop-prompt "### Assistant: "
---rm-trailing-space-workaround
diff --git a/configs/vicuna.txt b/configs/vicuna.txt
deleted file mode 100644
index 6d811410a..000000000
--- a/configs/vicuna.txt
+++ /dev/null
@@ -1,9 +0,0 @@
---interactive-first
---keep -1
---ins-prefix-bos
---ins-prefix "\n### Human: "
---ins-suffix "\n### Assistant: "
---reverse-prompt "### Human: "
---rm-trailing-space-workaround
-
--p "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
diff --git a/examples/common.cpp b/examples/common.cpp
index eaa5aceea..0772dbfe1 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -2,13 +2,10 @@
 
 #include <cassert>
 #include <cstring>
-#include <iostream>
 #include <fstream>
-#include <sstream>
 #include <string>
 #include <iterator>
 #include <algorithm>
-#include <regex>
 
 #if defined (_WIN32)
 #include <fcntl.h>
@@ -26,43 +23,6 @@ extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int
 #define CP_UTF8 65001
 #endif
 
-void split_args(const std::string & args_string, std::vector<std::string> & output_args)
-{
-    std::string current_arg = "";
-    bool in_quotes = false;
-    char quote_type;
-
-    for (char c : args_string) {
-        if (c == '"' || c == '\'') {
-            if (!in_quotes) {
-                in_quotes = true;
-                quote_type = c;
-            } else if (quote_type == c) {
-                in_quotes = false;
-            } else {
-                current_arg += c;
-            }
-        } else if (in_quotes) {
-            current_arg += c;
-        } else if (std::isspace(c)) {
-            if (current_arg != "") {
-                output_args.push_back(current_arg);
-                current_arg = "";
-            }
-        } else {
-            current_arg += c;
-        }
-    }
-
-    if (current_arg != "") {
-        output_args.push_back(current_arg);
-    }
-}
-
-std::string unescape(const std::string & str) {
-    return std::regex_replace(str, std::regex("\\\\n"), "\n");
-}
-
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     // determine sensible default number of threads.
     // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
@@ -80,11 +40,28 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     std::string arg;
     gpt_params default_params;
 
-    // get additional arguments from config files
-    std::vector<std::string> args;
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
-        if (arg == "--config") {
+
+        if (arg == "-s" || arg == "--seed") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.seed = std::stoi(argv[i]);
+        } else if (arg == "-t" || arg == "--threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_threads = std::stoi(argv[i]);
+        } else if (arg == "-p" || arg == "--prompt") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.prompt = argv[i];
+        } else if (arg == "-f" || arg == "--file") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -95,153 +72,85 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            std::string args_string;
-            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(args_string));
-            if (args_string.back() == '\n') {
-                args_string.pop_back();
-            }
-            split_args(args_string, args);
-            for (int j = 0; j < args.size(); j++) {
-                args[j] = unescape(args[j]);
-            }
-        } else {
-            args.emplace_back(argv[i]);
-        }
-    }
-
-    // parse args
-    int args_c = static_cast<int>(args.size());
-    for (int i = 0; i < args_c && !invalid_param; i++) {
-        arg = args[i];
-
-        if (arg == "-s" || arg == "--seed") {
-            if (++i >= args_c) {
-                invalid_param = true;
-                break;
-            }
-            params.seed = std::stoi(args[i]);
-        } else if (arg == "-t" || arg == "--threads") {
-            if (++i >= args_c) {
-                invalid_param = true;
-                break;
-            }
-            params.n_threads = std::stoi(args[i]);
-        } else if (arg == "-p" || arg == "--prompt") {
-            if (++i >= args_c) {
-                invalid_param = true;
-                break;
-            }
-            params.prompt = args[i];
-        } else if (arg == "-f" || arg == "--file") {
-            if (++i >= args_c) {
-                invalid_param = true;
-                break;
-            }
-            std::ifstream file(args[i]);
-            if (!file) {
-                fprintf(stderr, "error: failed to open file '%s'\n", args[i].c_str());
-                invalid_param = true;
-                break;
-            }
             std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
             if (params.prompt.back() == '\n') {
                 params.prompt.pop_back();
             }
         } else if (arg == "-n" || arg == "--n_predict") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.n_predict = std::stoi(args[i]);
+            params.n_predict = std::stoi(argv[i]);
         } else if (arg == "--top_k") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.top_k = std::stoi(args[i]);
+            params.top_k = std::stoi(argv[i]);
         } else if (arg == "-c" || arg == "--ctx_size") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.n_ctx = std::stoi(args[i]);
+            params.n_ctx = std::stoi(argv[i]);
         } else if (arg == "--memory_f32") {
             params.memory_f16 = false;
         } else if (arg == "--top_p") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.top_p = std::stof(args[i]);
+            params.top_p = std::stof(argv[i]);
         } else if (arg == "--temp") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.temp = std::stof(args[i]);
+            params.temp = std::stof(argv[i]);
         } else if (arg == "--repeat_last_n") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.repeat_last_n = std::stoi(args[i]);
+            params.repeat_last_n = std::stoi(argv[i]);
         } else if (arg == "--repeat_penalty") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.repeat_penalty = std::stof(args[i]);
+            params.repeat_penalty = std::stof(argv[i]);
         } else if (arg == "-b" || arg == "--batch_size") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.n_batch = std::stoi(args[i]);
+            params.n_batch = std::stoi(argv[i]);
             params.n_batch = std::min(512, params.n_batch);
         } else if (arg == "--keep") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.n_keep = std::stoi(args[i]);
+            params.n_keep = std::stoi(argv[i]);
         } else if (arg == "-m" || arg == "--model") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.model = args[i];
+            params.model = argv[i];
         } else if (arg == "-i" || arg == "--interactive") {
             params.interactive = true;
         } else if (arg == "--embedding") {
             params.embedding = true;
-        } else if (arg == "--clean-interface") {
-            params.clean_interface = true;
         } else if (arg == "--interactive-start") {
             params.interactive = true;
         } else if (arg == "--interactive-first") {
             params.interactive_start = true;
         } else if (arg == "-ins" || arg == "--instruct") {
-            fprintf(stderr, "\n\nWarning: instruct mode is deprecated! Use: \n"
-                "--clean-interface "
-                "--interactive-first "
-                "--keep -1 "
-                "--ins-prefix-bos "
-                "--ins-prefix \"\\n\\n### Instruction:\\n\\n\" "
-                "--ins-suffix \"\\n\\n### Response:\\n\\n\" "
-                "-r \"### Instruction:\\n\\n\" "
-            "\n\n");
-            // params.instruct = true;
-            params.clean_interface = true;
-            params.interactive_start = true;
-            params.n_keep = -1;
-            params.instruct_prefix_bos = true;
-            params.instruct_prefix = "\n\n### Instruction:\n\n";
-            params.instruct_suffix = "\n\n### Response:\n\n";
-            params.antiprompt.push_back("### Instruction:\n\n");
+            params.instruct = true;
         } else if (arg == "--color") {
             params.use_color = true;
-        } else if (arg == "--disable-multiline") {
-            params.multiline_mode = false;
         } else if (arg == "--mlock") {
             params.use_mlock = true;
         } else if (arg == "--no-mmap") {
@@ -251,94 +160,65 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "--verbose-prompt") {
             params.verbose_prompt = true;
         } else if (arg == "-r" || arg == "--reverse-prompt") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.antiprompt.push_back(args[i]);
-        } else if (arg == "--stop-prompt") {
-            if (++i >= args_c) {
-                invalid_param = true;
-                break;
-            }
-            params.stopprompt.push_back(args[i]);
-        } else if (arg == "--rm-trailing-space-workaround") {
-            params.rm_trailing_space_workaround = true;
+            params.antiprompt.push_back(argv[i]);
         } else if (arg == "--perplexity") {
             params.perplexity = true;
         } else if (arg == "--ignore-eos") {
             params.ignore_eos = true;
         } else if (arg == "--n_parts") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.n_parts = std::stoi(args[i]);
+            params.n_parts = std::stoi(argv[i]);
         } else if (arg == "-h" || arg == "--help") {
-            gpt_print_usage(argv[0], default_params);
+            gpt_print_usage(argc, argv, default_params);
             exit(0);
         } else if (arg == "--random-prompt") {
             params.random_prompt = true;
         } else if (arg == "--in-prefix") {
-            if (++i >= args_c) {
+            if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.input_prefix = args[i];
-        } else if (arg == "--ins-prefix-bos") {
-            params.instruct_prefix_bos = true;
-        } else if (arg == "--ins-prefix") {
-            if (++i >= args_c) {
-                invalid_param = true;
-                break;
-            }
-            params.instruct_prefix = args[i];
-        } else if (arg == "--ins-suffix-bos") {
-            params.instruct_suffix_bos = true;
-        } else if (arg == "--ins-suffix") {
-            if (++i >= args_c) {
-                invalid_param = true;
-                break;
-            }
-            params.instruct_suffix = args[i];
+            params.input_prefix = argv[i];
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            gpt_print_usage(argv[0], default_params);
+            gpt_print_usage(argc, argv, default_params);
             exit(1);
         }
     }
     if (invalid_param) {
         fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        gpt_print_usage(argv[0], default_params);
+        gpt_print_usage(argc, argv, default_params);
         exit(1);
     }
 
     return true;
 }
 
-void gpt_print_usage(char * argv_0, const gpt_params & params) {
-    fprintf(stderr, "usage: %s [options]\n", argv_0);
+void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
     fprintf(stderr, "  -h, --help            show this help message and exit\n");
     fprintf(stderr, "  -i, --interactive     run in interactive mode\n");
     fprintf(stderr, "  --interactive-first   run in interactive mode and wait for input right away\n");
-    fprintf(stderr, "  --clean-interface     hides input prefix & suffix and displays '>' instead\n");
+    fprintf(stderr, "  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
     fprintf(stderr, "  -r PROMPT, --reverse-prompt PROMPT\n");
     fprintf(stderr, "                        run in interactive mode and poll user input upon seeing PROMPT (can be\n");
     fprintf(stderr, "                        specified more than once for multiple prompts).\n");
     fprintf(stderr, "  --color               colorise output to distinguish prompt and user input from generations\n");
-    fprintf(stderr, "  --disable-multiline   disable multiline mode (use Ctrl+D on Linux/Mac and Ctrl+Z then Return on Windows to toggle multiline)\n");
     fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for <= 0)\n");
     fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
     fprintf(stderr, "                        prompt to start generation with (default: empty)\n");
     fprintf(stderr, "  --random-prompt       start with a randomized prompt.\n");
     fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
-    fprintf(stderr, "  --ins-prefix STRING   (instruct) prefix user inputs with tokenized string (default: empty)\n");
-    fprintf(stderr, "  --ins-prefix-bos      (instruct) prepend bos token to instruct prefix.\n");
-    fprintf(stderr, "  --ins-suffix STRING   (instruct) suffix user inputs with tokenized string (default: empty)\n");
-    fprintf(stderr, "  --ins-suffix-bos      (instruct) prepend bos token to instruct suffix.\n");
     fprintf(stderr, "  -f FNAME, --file FNAME\n");
     fprintf(stderr, "                        prompt file to start generation.\n");
     fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
@@ -448,61 +328,3 @@ void win32_utf8_encode(const std::wstring & wstr, std::string & str) {
     str = strTo;
 }
 #endif
-
-bool get_input_text(std::string & input_text, bool eof_toggled_multiline_mode) {
-    bool another_line = true;
-    bool is_eof_multiline_toggled = false;
-    do {
-        std::string line;
-#if defined(_WIN32)
-        auto & stdcin = std::wcin;
-        std::wstring wline;
-        if (!std::getline(stdcin, wline)) {
-            // input stream is bad or EOF received
-            if (stdcin.bad()) {
-                fprintf(stderr, "%s: error: input stream bad\n", __func__);
-                return 1;
-            }
-        }
-        win32_utf8_encode(wline, line);
-#else
-        auto & stdcin = std::cin;
-        if (!std::getline(stdcin, line)) {
-            // input stream is bad or EOF received
-            if (stdcin.bad()) {
-                fprintf(stderr, "%s: error: input stream bad\n", __func__);
-                return 1;
-            }
-        }
-#endif
-        if (stdcin.eof()) {
-            stdcin.clear();
-            stdcin.seekg(0, std::ios::beg);
-            if (!eof_toggled_multiline_mode) {
-                another_line = false;
-            } else {
-                is_eof_multiline_toggled = !is_eof_multiline_toggled;
-                if (is_eof_multiline_toggled) {
-                    input_text += line;
-                    continue;
-                }
-            }
-        }
-        if (!eof_toggled_multiline_mode) {
-            if (line.empty() || line.back() != '\\') {
-                another_line = false;
-            } else {
-                line.pop_back(); // Remove the continue character
-            }
-        } else {
-            if (!is_eof_multiline_toggled) {
-                another_line = false;
-            }
-        }
-        input_text += line;
-        if (another_line) {
-            input_text += '\n'; // Append the line to the result
-        }
-    } while (another_line);
-    return true;
-}
diff --git a/examples/common.h b/examples/common.h
index df8e4c6cc..1ea6f7445 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -14,14 +14,14 @@
 //
 
 struct gpt_params {
-    int32_t seed          = -1;    // RNG seed
-    int32_t n_threads     = std::min(4, (int32_t) std::thread::hardware_concurrency()); // max 4 threads (default)
-    int32_t n_predict     = 128;   // new tokens to predict
-    int32_t repeat_last_n = 64;    // last n tokens to penalize
-    int32_t n_parts       = -1;    // amount of model parts (-1 = determine from model dimensions)
-    int32_t n_ctx         = 512;   // context size
-    int32_t n_batch       = 8;     // batch size for prompt processing
-    int32_t n_keep        = 0;     // number of tokens to keep from initial prompt (-1 for all)
+    int32_t seed          = -1;   // RNG seed
+    int32_t n_threads     = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t n_predict     = 128;  // new tokens to predict
+    int32_t repeat_last_n = 64;   // last n tokens to penalize
+    int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
+    int32_t n_ctx         = 512;  // context size
+    int32_t n_batch       = 8;    // batch size for prompt processing
+    int32_t n_keep        = 0;    // number of tokens to keep from initial prompt
 
     // sampling parameters
     int32_t top_k = 40;
@@ -33,15 +33,8 @@ struct gpt_params {
     std::string prompt = "";
     std::string input_prefix = ""; // string to prefix user inputs with
 
-    std::string instruct_prefix = ""; // prefix user inputs with tokenized string
-    bool instruct_prefix_bos = false; // prepend bos token to instruct prefix
-    std::string instruct_suffix = ""; // suffix user inputs with tokenized string
-    bool instruct_suffix_bos = false; // prepend bos token to instruct suffix
 
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
-    std::vector<std::string> stopprompt; // string upon seeing which more user input is prompted (without adding instruct prefixes and suffixes)
-
-    bool rm_trailing_space_workaround = false; // workaround for removing trailing space from reverse/stop prompts
 
     bool memory_f16        = true;  // use f16 instead of f32 for memory kv
     bool random_prompt     = false; // do not randomize prompt if none provided
@@ -58,14 +51,11 @@ struct gpt_params {
     bool use_mlock         = false; // use mlock to keep model in memory
     bool mem_test          = false; // compute maximum memory usage
     bool verbose_prompt    = false; // print prompt tokens before generation
-
-    bool clean_interface   = false; // hides input prefix & suffix and displays '>'
-    bool multiline_mode    = true; // enables multi-line mode, to send input press CTRL+D on Linux/Max, Ctrl+Z then Return on Windows
 };
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
 
-void gpt_print_usage(char * argv_0, const gpt_params & params);
+void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
 
 std::string gpt_random_prompt(std::mt19937 & rng);
 
@@ -105,5 +95,3 @@ void set_console_color(console_state & con_st, console_color_t color);
 void win32_console_init(bool enable_color);
 void win32_utf8_encode(const std::wstring & wstr, std::string & str);
 #endif
-
-bool get_input_text(std::string & input_text, bool escape_newline_mode);
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 68b4b2840..ba153cb82 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -30,8 +30,7 @@ static bool is_interacting = false;
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 void sigint_handler(int signo) {
     set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
-    fflush(stdout);
-    fflush(stderr);
+    printf("\n"); // this also force flush stdout.
     if (signo == SIGINT) {
         if (!is_interacting) {
             is_interacting=true;
@@ -90,8 +89,6 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    bool instruct_mode = !params.instruct_prefix.empty() || !params.instruct_suffix.empty();
-
 //    params.prompt = R"(// this function checks if the number n is prime
 //bool is_prime(int n) {)";
 
@@ -156,20 +153,22 @@ int main(int argc, char ** argv) {
     }
 
     // number of tokens to keep when resetting context
-    if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size()) {
+    if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) {
         params.n_keep = (int)embd_inp.size();
     }
 
     // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, params.instruct_prefix, params.instruct_prefix_bos);
-    std::string instruct_suffix = params.instruct_suffix;
-    if (params.rm_trailing_space_workaround) {
-        if (instruct_suffix.back() == ' ') { instruct_suffix.pop_back(); }
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
+    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
+
+    // in instruct mode, we inject a prefix and a suffix to each input by the user
+    if (params.instruct) {
+        params.interactive_start = true;
+        params.antiprompt.push_back("### Instruction:\n\n");
     }
-    const auto inp_sfx = ::llama_tokenize(ctx, instruct_suffix, params.instruct_suffix_bos);
 
     // enable interactive mode if reverse prompt or interactive start is specified
-    if (params.antiprompt.size() != 0 || params.stopprompt.size() != 0 || params.interactive_start) {
+    if (params.antiprompt.size() != 0 || params.interactive_start) {
         params.interactive = true;
     }
 
@@ -211,21 +210,10 @@ int main(int argc, char ** argv) {
                 fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
             }
         }
-        if (params.stopprompt.size()) {
-            for (auto stopprompt : params.stopprompt) {
-                fprintf(stderr, "Stop prompt: '%s'\n", stopprompt.c_str());
-            }
-        }
 
         if (!params.input_prefix.empty()) {
             fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
         }
-        if (!params.instruct_prefix.empty()) {
-            fprintf(stderr, "Instruct prefix %s: '%s'\n", params.instruct_prefix_bos ? "(with bos token)" : "", params.instruct_prefix.c_str());
-        }
-        if (!params.instruct_suffix.empty()) {
-            fprintf(stderr, "Instruct suffix %s: '%s'\n", params.instruct_suffix_bos ? "(with bos token)" : "", params.instruct_suffix.c_str());
-        }
     }
     fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n",
         params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
@@ -241,29 +229,12 @@ int main(int argc, char ** argv) {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
                " - Press Ctrl+C to interject at any time.\n"
 #endif
-        );
-        if (params.multiline_mode) {
-            fprintf(stderr, " - Press Return to return control to LLaMa.\n"
-#if defined (_WIN32)
-                            " - [MULTILINE MODE] Press Ctrl+Z then Return (EOF) to toggle.\n\n");
-#else
-                            " - [MULTILINE MODE] Press Ctrl+D (EOF) to toggle.\n\n");
-#endif
-        }
-        else {
-            fprintf(stderr, " - Press Return to return control to LLaMa.\n"
-                            " - If you want to submit another line, end your input in '\\'.\n\n");
-        }
+               " - Press Return to return control to LLaMa.\n"
+               " - If you want to submit another line, end your input in '\\'.\n\n");
         is_interacting = params.interactive_start;
     }
 
-    struct Antiprompt {
-        bool any = false;
-        bool trailing_space = false;
-        size_t len;
-        bool is_stop_prompt = false;
-    } antiprompt;
-
+    bool is_antiprompt = false;
     bool input_noecho  = false;
 
     int n_past     = 0;
@@ -333,7 +304,7 @@ int main(int argc, char ** argv) {
             }
 
             // replace end of text token with newline token when in interactive mode
-            if (id == llama_token_eos() && params.interactive && !instruct_mode) {
+            if (id == llama_token_eos() && params.interactive && !params.instruct) {
                 id = llama_token_newline.front();
                 if (params.antiprompt.size() != 0) {
                     // tokenize and inject first reverse prompt
@@ -379,72 +350,27 @@ int main(int argc, char ** argv) {
         // check if we should prompt the user for more
         if (params.interactive && (int) embd_inp.size() <= n_consumed) {
 
-            // check for reverse prompt or stop prompt
-            if (params.antiprompt.size() || params.stopprompt.size()) {
+            // check for reverse prompt
+            if (params.antiprompt.size()) {
                 std::string last_output;
                 for (auto id : last_n_tokens) {
                     last_output += llama_token_to_str(ctx, id);
                 }
 
-                antiprompt.any = false;
-                antiprompt.is_stop_prompt = false;
+                is_antiprompt = false;
                 // Check if each of the reverse prompts appears at the end of the output.
-                for (std::string & prompt : params.antiprompt) {
-                    if (params.rm_trailing_space_workaround) {
-                        antiprompt.trailing_space = prompt.back() == ' ';
-                        antiprompt.len = prompt.length() - (antiprompt.trailing_space ? 1 : 0);
-                    }
-                    if (last_output.find(prompt.c_str(), last_output.length() - antiprompt.len, antiprompt.len) != std::string::npos) {
+                for (std::string & antiprompt : params.antiprompt) {
+                    if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
                         is_interacting = true;
-                        antiprompt.any = true;
+                        is_antiprompt = true;
                         set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
                         fflush(stdout);
                         break;
                     }
                 }
-                if (!antiprompt.any) {
-                    for (std::string & prompt : params.stopprompt) {
-                        if (params.rm_trailing_space_workaround) {
-                            antiprompt.trailing_space = prompt.back() == ' ';
-                            antiprompt.len = prompt.length() - (antiprompt.trailing_space ? 1 : 0);
-                        }
-                        if (last_output.find(prompt.c_str(), last_output.length() - antiprompt.len, antiprompt.len) != std::string::npos) {
-                            is_interacting = true;
-                            antiprompt.any = true;
-                            antiprompt.is_stop_prompt = true;
-                            set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
-                            fflush(stdout);
-                            break;
-                        }
-                    }
-                }
             }
 
-            if (n_past > 0 && is_interacting)
-            {
-                std::string buffer;
-                if (!params.clean_interface && !params.instruct_prefix.empty() && !antiprompt.any) {
-                    // avoid printing again user's new line (TODO: try to revert enter press and print newline)
-                    int i = params.instruct_prefix.front() == '\n' ? 1 : 0;
-                    for (; i < inp_pfx.size(); i++) {
-                        printf("%s", llama_token_to_str(ctx, inp_pfx[i]));
-                    }
-                    fflush(stdout);
-                }
-                if (params.rm_trailing_space_workaround) {
-                    // add only if not stopprompt (as stopprompt could be used to pause
-                        //     assistant and then continue without input - adding back trailing
-                        //     space may mess it up.)
-                    if (!antiprompt.is_stop_prompt && antiprompt.any && antiprompt.trailing_space) {
-                        // add back removed trailing space to buffer(workaround)
-                        buffer += ' ';
-                        if (!params.clean_interface) {
-                            printf("%s", buffer.c_str());
-                        }
-                        fflush(stdout);
-                    }
-                }
-
+            if (n_past > 0 && is_interacting) {
                 // potentially set color to indicate we are taking user input
                 set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
 
@@ -453,45 +379,49 @@ int main(int argc, char ** argv) {
                 signal(SIGINT, sigint_handler);
 #endif
 
-                if (params.clean_interface) {
+                if (params.instruct) {
                     printf("\n> ");
                 }
 
+                std::string buffer;
                 if (!params.input_prefix.empty()) {
                     buffer += params.input_prefix;
                     printf("%s", buffer.c_str());
                 }
 
-                if (!get_input_text(buffer, params.multiline_mode)) {
-                    // input stream is bad
-                    return 1;
-                }
-                if (!antiprompt.is_stop_prompt) {
-                    buffer += "\n";
-                }
+                std::string line;
+                bool another_line = true;
+                do {
+#if defined(_WIN32)
+                    std::wstring wline;
+                    if (!std::getline(std::wcin, wline)) {
+                        // input stream is bad or EOF received
+                        return 0;
+                    }
+                    win32_utf8_encode(wline, line);
+#else
+                    if (!std::getline(std::cin, line)) {
+                        // input stream is bad or EOF received
+                        return 0;
+                    }
+#endif
+                    if (line.empty() || line.back() != '\\') {
+                        another_line = false;
+                    } else {
+                        line.pop_back(); // Remove the continue character
+                    }
+                    buffer += line + '\n'; // Append the line to the result
+                } while (another_line);
 
                 // done taking input, reset color
                 set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
 
-                if (!params.clean_interface && !params.instruct_suffix.empty() && !antiprompt.is_stop_prompt) {
-                    // avoid printing again user's new line (TODO: try to revert enter press and print newline)
-                    int i = params.instruct_suffix.front() == '\n' ? 1 : 0;
-                    for (; i < inp_sfx.size(); i++) {
-                        printf("%s", llama_token_to_str(ctx, inp_sfx[i]));
-                    }
-                    // if (remove trailing space workaround) {
-                    //     We won't add back removed trailing space here, because assistant continues here,
-                    //         and it may mess up it's output (remove trailing space workaround).
-                    // }
-                    fflush(stdout);
-                }
-
                 // Add tokens to embd only if the input buffer is non-empty
                 // Entering a empty line lets the user pass control back
                 if (buffer.length() > 1) {
 
-                    // insert input prefix
-                    if (!params.instruct_prefix.empty() && !antiprompt.any) {
+                    // instruct mode: insert instruction prefix
+                    if (params.instruct && !is_antiprompt) {
                         n_consumed = embd_inp.size();
                         embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
                     }
@@ -499,8 +429,8 @@ int main(int argc, char ** argv) {
                     auto line_inp = ::llama_tokenize(ctx, buffer, false);
                     embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
 
-                    // insert response suffix
-                    if (!params.instruct_suffix.empty() && !antiprompt.is_stop_prompt) {
+                    // instruct mode: insert response suffix
+                    if (params.instruct) {
                         embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
                     }
 
@@ -517,7 +447,7 @@ int main(int argc, char ** argv) {
 
         // end of text token
         if (!embd.empty() && embd.back() == llama_token_eos()) {
-            if (instruct_mode) {
+            if (params.instruct) {
                 is_interacting = true;
             } else {
                 fprintf(stderr, " [end of text]\n");
diff --git a/prompts/alpaca.txt b/prompts/alpaca.txt
new file mode 100644
index 000000000..2224bdeb0
--- /dev/null
+++ b/prompts/alpaca.txt
@@ -0,0 +1 @@
+Below is an instruction that describes a task. Write a response that appropriately completes the request.
diff --git a/prompts/chat-with-bob.txt b/prompts/chat-with-bob.txt
new file mode 100644
index 000000000..ad494d831
--- /dev/null
+++ b/prompts/chat-with-bob.txt
@@ -0,0 +1,7 @@
+Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
+
+User: Hello, Bob.
+Bob: Hello. How may I help you today?
+User: Please tell me the largest city in Europe.
+Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
+User:
\ No newline at end of file

From 106faaf2971d6c89d6010279a9a95737772470ef Mon Sep 17 00:00:00 2001
From: katsu560 <118887472+katsu560@users.noreply.github.com>
Date: Sat, 15 Apr 2023 14:51:11 +0900
Subject: [PATCH 32/34] cmake : add finding the OpenBLAS header file (#992)

---
 CMakeLists.txt | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d5715d92a..5a20de3a2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -120,6 +120,21 @@ if (LLAMA_OPENBLAS)
         add_compile_definitions(GGML_USE_OPENBLAS)
         add_link_options(${BLAS_LIBRARIES})
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas)
+
+        # find header file
+        set(OPENBLAS_INCLUDE_SEARCH_PATHS
+            /usr/include
+            /usr/include/openblas
+            /usr/include/openblas-base
+            /usr/local/include
+            /usr/local/include/openblas
+            /usr/local/include/openblas-base
+            /opt/OpenBLAS/include
+            $ENV{OpenBLAS_HOME}
+            $ENV{OpenBLAS_HOME}/include
+            )
+        find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
+        add_compile_options(-I${OPENBLAS_INC})
     else()
         message(WARNING "OpenBLAS not found")
     endif()

From c12b14b77fced0ce9a0e2d81f670c3a746dec251 Mon Sep 17 00:00:00 2001
From: Ivan Komarov <Ivan.Komarov@dfyz.info>
Date: Sat, 15 Apr 2023 07:51:54 +0200
Subject: [PATCH 33/34] benchmark : fix result validation in
 benchmark-q4_0-matmult (#987)

---
 examples/benchmark/benchmark-q4_0-matmult.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/benchmark/benchmark-q4_0-matmult.c b/examples/benchmark/benchmark-q4_0-matmult.c
index 90f537fd8..84b06766c 100644
--- a/examples/benchmark/benchmark-q4_0-matmult.c
+++ b/examples/benchmark/benchmark-q4_0-matmult.c
@@ -24,7 +24,7 @@
 
 float tensor_sum_elements(struct ggml_tensor * tensor) {
     float sum = 0;
-    if (tensor->type==6) {
+    if (tensor->type==GGML_TYPE_F32) {
         for (int j = 0; j < tensor->ne[1]; j++) {
             for (int k = 0; k < tensor->ne[0]; k++) {
                 sum +=  ((float *) tensor->data)[j*tensor->ne[0]+k];

From aa485cee334e84437e21681c14b6f80b65876d8b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 15 Apr 2023 14:25:45 +0300
Subject: [PATCH 34/34] ggml : use posix_memalign on non-Windows env

---
 ggml.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/ggml.c b/ggml.c
index 1574d6498..cf6a81f43 100644
--- a/ggml.c
+++ b/ggml.c
@@ -118,7 +118,16 @@ typedef void* thread_ret_t;
 #define GGML_ALIGNED_MALLOC(size)  _aligned_malloc(size, GGML_MEM_ALIGN)
 #define GGML_ALIGNED_FREE(ptr)     _aligned_free(ptr)
 #else
-#define GGML_ALIGNED_MALLOC(size)  aligned_alloc(GGML_MEM_ALIGN, size)
+inline static void* ggml_aligned_malloc(size_t size) {
+    void* aligned_memory = NULL;
+    int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
+    if (result != 0) {
+        // Handle allocation failure
+        return NULL;
+    }
+    return aligned_memory;
+}
+#define GGML_ALIGNED_MALLOC(size)  ggml_aligned_malloc(size)
 #define GGML_ALIGNED_FREE(ptr)     free(ptr)
 #endif
 
@@ -531,31 +540,31 @@ inline static float vaddvq_f32(float32x4_t v) {
     return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
 }
 
-inline float vminvq_f32(float32x4_t v) {
+float vminvq_f32(float32x4_t v) {
     return
         MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
             MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
 }
 
-inline float vmaxvq_f32(float32x4_t v) {
+float vmaxvq_f32(float32x4_t v) {
     return
         MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
             MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
 }
 
-inline int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
+int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
     return vget_low_s8(vcombine_s8(a, b));
 }
 
-inline int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
+int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
     return vget_high_s8(vcombine_s8(a, b));
 }
 
-inline uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
+uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
     return vget_low_u8(vcombine_u8(a, b));
 }
 
-inline uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
+uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
     return vget_high_u8(vcombine_u8(a, b));
 }