Merge branch 'master' into custom-attention-mask

Georgi Gerganov 2023-09-28 15:19:57 +03:00
commit 25856900db
36 changed files with 730 additions and 239 deletions

View file

@@ -1,3 +1,4 @@
#include "build-info.h"
#include "common.h"
#include "ggml.h"
@@ -20,7 +21,7 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
if (plan.work_size > 0) {
@@ -31,19 +32,19 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
ggml_graph_compute(graph, &plan);
}
float tensor_sum_elements(const ggml_tensor * tensor) {
float sum = 0;
if (tensor->type==GGML_TYPE_F32) {
static float tensor_sum_elements(const ggml_tensor * tensor) {
double sum = 0;
if (tensor->type == GGML_TYPE_F32) {
for (int j = 0; j < tensor->ne[1]; j++) {
for (int k = 0; k < tensor->ne[0]; k++) {
sum += ((float *) tensor->data)[j*tensor->ne[0]+k];
sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
}
}
}
return sum;
}
void tensor_dump(const ggml_tensor * tensor, const char * name) {
static void tensor_dump(const ggml_tensor * tensor, const char * name) {
printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
tensor->type, ggml_type_name(tensor->type),
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
@@ -58,7 +59,7 @@ struct benchmark_params_struct {
int32_t n_iterations = 10;
};
void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
@@ -125,12 +126,15 @@ int main(int argc, char ** argv) {
//printf("Memsize required = %i\n", sizex*sizex);
// TODO: perform the bench for all types or for a user specified type
const ggml_type qtype = GGML_TYPE_Q4_1;
size_t ctx_size = 0;
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
ctx_size += sizex*sizey*ggml_type_sizef(qtype);
ctx_size += sizex*sizey*ggml_type_sizef(qtype);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
ctx_size += 1024*1024*16;
@@ -163,7 +167,7 @@ int main(int argc, char ** argv) {
struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
ggml_set_f32(m2, 2.0f);
printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
printf("\n------ Test 1 - Matrix Mult via F32 code\n");
// printf("Creating new tensor m11xm2\n");
struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
@@ -181,17 +185,16 @@ int main(int argc, char ** argv) {
TENSOR_DUMP(gf.nodes[0]);
printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
int32_t nelements = sizex*sizey;
int32_t ne[2] = { sizex, sizey };
std::vector<int64_t> hist_cur(1 << 4, 0);
// Set up the benchmark matrices
// printf("Creating new tensor q11 & Running quantize\n");
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());
// Set up the compute graph
// printf("Creating new tensor q31\n");
@@ -202,8 +205,8 @@ int main(int argc, char ** argv) {
// Set up a second graph computation to make sure we override the CPU cache lines
// printf("Creating new tensor q12 & Running quantize\n");
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());
// printf("Creating new tensor q32\n");
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
@@ -220,7 +223,7 @@ int main(int argc, char ** argv) {
printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
// Let's use the F32 result from above as a reference for the q4_0 multiplication
// Let's use the F32 result from above as a reference for the quantized multiplication
float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
@ -250,7 +253,7 @@ int main(int argc, char ** argv) {
// Check that the matrix multiplication result is in the right ballpark
// We cannot use the exact value from the F32 multiplication because the quantization will be slightly different
float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
float delta = abs(sum_of_Q4_result - sum_of_F32_reference);
float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
if (delta > allowed_delta) {

View file

@@ -1,3 +1,4 @@
#include "build-info.h"
#include "common.h"
#include "embd-input.h"

View file

@@ -1,3 +1,21 @@
# embedding
# llama.cpp/example/embedding
TODO
This example demonstrates how to generate a high-dimensional embedding vector for a given text with llama.cpp.
## Quick Start
To get started right away, run the following command, making sure to use the correct path for the model you have:
### Unix-based systems (Linux, macOS, etc.):
```bash
./embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
```
### Windows:
```powershell
embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
```
The above command will output space-separated float values.
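The output can be redirected to a file and post-processed. As a rough illustration (the file names below are placeholders, not part of this example), two such embeddings can be compared with a short Python script:

```python
# compare_embeddings.py - minimal sketch; assumes each input file holds the
# space-separated output of one ./embedding run, e.g. "./embedding ... > a.txt".
import math
import sys

def load_embedding(path):
    with open(path) as f:
        return [float(x) for x in f.read().split()]

a = load_embedding(sys.argv[1])
b = load_embedding(sys.argv[2])

dot  = sum(x * y for x, y in zip(a, b))
norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(x * x for x in b))
print(f"cosine similarity: {dot / norm:.4f}")
```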

View file

@@ -1,3 +1,4 @@
#include "build-info.h"
#include "common.h"
#include "llama.h"

View file

@@ -367,10 +367,10 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
keyidx = gguf_find_key(ggufctx, "general.architecture");
if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.file_type");
if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
if (keyidx != -1) { printf("%s: model file type = %" PRIu32 "\n", __func__, gguf_get_val_u32(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
keyidx = gguf_find_key(ggufctx, "general.source.huggingface.repository");
if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
}

View file

@@ -380,10 +380,10 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
keyidx = gguf_find_key(ggufctx, "general.architecture");
if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.file_type");
if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
if (keyidx != -1) { printf("%s: model file type = %" PRIu32 "\n", __func__, gguf_get_val_u32(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
keyidx = gguf_find_key(ggufctx, "general.source.huggingface.repository");
if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
}

View file

@@ -0,0 +1,271 @@
# llama.cpp/example/llama-bench
Performance testing tool for llama.cpp.
## Table of contents
1. [Syntax](#syntax)
2. [Examples](#examples)
1. [Text generation with different models](#text-generation-with-different-models)
2. [Prompt processing with different batch sizes](#prompt-processing-with-different-batch-sizes)
3. [Different numbers of threads](#different-numbers-of-threads)
4. [Different numbers of layers offloaded to the GPU](#different-numbers-of-layers-offloaded-to-the-gpu)
3. [Output formats](#output-formats)
1. [Markdown](#markdown)
2. [CSV](#csv)
3. [JSON](#json)
4. [SQL](#sql)
## Syntax
```
usage: ./llama-bench [options]
options:
-h, --help
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
-p, --n-prompt <n> (default: 512)
-n, --n-gen <n> (default: 128)
-b, --batch-size <n> (default: 512)
--memory-f32 <0|1> (default: 0)
-t, --threads <n> (default: 16)
-ngl N, --n-gpu-layers <n> (default: 99)
-mg i, --main-gpu <i> (default: 0)
-mmq, --mul-mat-q <0|1> (default: 1)
-ts, --tensor_split <ts0/ts1/..>
-r, --repetitions <n> (default: 5)
-o, --output <csv|json|md|sql> (default: md)
-v, --verbose (default: 0)
Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
```
llama-bench can perform two types of tests:
- Prompt processing (pp): processing a prompt in batches (`-p`)
- Text generation (tg): generating a sequence of tokens (`-n`)
With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options. To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`).
Each test is repeated the number of times given by `-r`, and the results are averaged. The results are given in average tokens per second (t/s) and standard deviation. Some output formats (e.g. json) also include the individual results of each repetition.
For a description of the other options, see the [main example](../main/README.md).
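As an illustration of how the comma-separated values combine, the sketch below (plain Python, not llama-bench code; the parameter values are examples only) enumerates the test grid that a command line such as `./llama-bench -m A.gguf,B.gguf -t 8,16 -p 512 -n 128,256` would run:

```python
# Sketch of the documented combination behavior: every parameter combination is run,
# with one pp test per non-zero -p value and one tg test per non-zero -n value.
from itertools import product

models   = ["A.gguf", "B.gguf"]   # -m
threads  = [8, 16]                # -t
n_prompt = [512]                  # -p -> pp tests
n_gen    = [128, 256]             # -n -> tg tests

for m, t in product(models, threads):
    for p in n_prompt:
        if p > 0:
            print(f"model={m} threads={t} test=pp {p}")
    for n in n_gen:
        if n > 0:
            print(f"model={m} threads={t} test=tg {n}")
```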
## Examples
### Text generation with different models
```sh
$ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0.gguf -p 0 -n 128,256,512
```
| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 132.19 ± 0.55 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 256 | 129.37 ± 0.54 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 512 | 123.83 ± 0.25 |
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 128 | 82.17 ± 0.31 |
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 256 | 80.74 ± 0.23 |
| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 512 | 78.08 ± 0.07 |
### Prompt processing with different batch sizes
```sh
$ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024
```
| model | size | params | backend | ngl | n_batch | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------: | ---------- | ---------------: |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 128 | pp 1024 | 1436.51 ± 3.66 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 256 | pp 1024 | 1932.43 ± 23.48 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 512 | pp 1024 | 2254.45 ± 15.59 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 1024 | pp 1024 | 2498.61 ± 13.58 |
### Different numbers of threads
```sh
$ ./llama-bench -n 0 -n 16 -p 64 -t 1,2,4,8,16,32
```
| model | size | params | backend | threads | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | ---------: | ---------- | ---------------: |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | pp 64 | 6.17 ± 0.07 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | tg 16 | 4.05 ± 0.02 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | pp 64 | 12.31 ± 0.13 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | tg 16 | 7.80 ± 0.07 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | pp 64 | 23.18 ± 0.06 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | tg 16 | 12.22 ± 0.07 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | pp 64 | 32.29 ± 1.21 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | tg 16 | 16.71 ± 0.66 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | pp 64 | 33.52 ± 0.03 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | tg 16 | 15.32 ± 0.05 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | pp 64 | 59.00 ± 1.11 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | tg 16 | 16.41 ± 0.79 |
### Different numbers of layers offloaded to the GPU
```sh
$ ./llama-bench -ngl 10,20,30,31,32,33,34,35
```
| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | pp 512 | 373.36 ± 2.25 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | tg 128 | 13.45 ± 0.93 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | pp 512 | 472.65 ± 1.25 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | tg 128 | 21.36 ± 1.94 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | pp 512 | 631.87 ± 11.25 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | tg 128 | 40.04 ± 1.82 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | pp 512 | 657.89 ± 5.08 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | tg 128 | 48.19 ± 0.81 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | pp 512 | 688.26 ± 3.29 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | tg 128 | 54.78 ± 0.65 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | pp 512 | 704.27 ± 2.24 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | tg 128 | 60.62 ± 1.76 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | pp 512 | 881.34 ± 5.40 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | tg 128 | 71.76 ± 0.23 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 |
## Output formats
By default, llama-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option.
### Markdown
```sh
$ ./llama-bench -o md
```
| model | size | params | backend | ngl | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | pp 512 | 2368.80 ± 93.24 |
| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 131.42 ± 0.59 |
### CSV
```sh
$ ./llama-bench -o csv
```
```csv
build_commit,build_number,cuda,opencl,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
```
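The CSV can be consumed with standard tooling; for example, a minimal Python sketch (output file name assumed) that prints one summary line per test:

```python
# Summarize llama-bench CSV output, e.g. produced with: ./llama-bench -o csv > results.csv
import csv

with open("results.csv", newline="") as f:
    for row in csv.DictReader(f):
        test = f"pp {row['n_prompt']}" if int(row["n_gen"]) == 0 else f"tg {row['n_gen']}"
        print(f"{row['model_type']}  {test}  "
              f"{float(row['avg_ts']):.2f} ± {float(row['stddev_ts']):.2f} t/s")
```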
### JSON
```sh
$ ./llama-bench -o json
```
```json
[
{
"build_commit": "3469684",
"build_number": 1275,
"cuda": true,
"opencl": false,
"metal": false,
"gpu_blas": true,
"blas": true,
"cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
"gpu_info": "NVIDIA GeForce RTX 3090 Ti",
"model_filename": "models/7B/ggml-model-q4_0.gguf",
"model_type": "llama 7B mostly Q4_0",
"model_size": 3825065984,
"model_n_params": 6738415616,
"n_batch": 512,
"n_threads": 16,
"f16_kv": true,
"n_gpu_layers": 99,
"main_gpu": 0,
"mul_mat_q": true,
"tensor_split": "0.00",
"n_prompt": 512,
"n_gen": 0,
"test_time": "2023-09-23T12:09:57Z",
"avg_ns": 212365953,
"stddev_ns": 985423,
"avg_ts": 2410.974041,
"stddev_ts": 11.163766,
"samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ],
"samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ]
},
{
"build_commit": "3469684",
"build_number": 1275,
"cuda": true,
"opencl": false,
"metal": false,
"gpu_blas": true,
"blas": true,
"cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K",
"gpu_info": "NVIDIA GeForce RTX 3090 Ti",
"model_filename": "models/7B/ggml-model-q4_0.gguf",
"model_type": "llama 7B mostly Q4_0",
"model_size": 3825065984,
"model_n_params": 6738415616,
"n_batch": 512,
"n_threads": 16,
"f16_kv": true,
"n_gpu_layers": 99,
"main_gpu": 0,
"mul_mat_q": true,
"tensor_split": "0.00",
"n_prompt": 0,
"n_gen": 128,
"test_time": "2023-09-23T12:09:59Z",
"avg_ns": 977425219,
"stddev_ns": 9268593,
"avg_ts": 130.965708,
"stddev_ts": 1.238924,
"samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ],
"samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ]
}
]
```
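The `samples_ns` field makes it possible to re-derive the reported averages: each sample is converted to tokens per second as `tokens / (ns * 1e-9)`, where `tokens` is `n_prompt` for pp tests and `n_gen` for tg tests. A minimal sketch (output file name assumed):

```python
# Recompute avg/stddev t/s from the raw samples, e.g. from: ./llama-bench -o json > results.json
import json
import statistics

with open("results.json") as f:
    results = json.load(f)

for r in results:
    tokens = r["n_prompt"] + r["n_gen"]  # exactly one of the two is non-zero per test
    ts = [tokens / (ns * 1e-9) for ns in r["samples_ns"]]
    print(f"{r['model_type']}: {statistics.mean(ts):.2f} ± {statistics.stdev(ts):.2f} t/s")
```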
### SQL
SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database.
```sh
$ ./llama-bench -o sql
```
```sql
CREATE TABLE IF NOT EXISTS test (
build_commit TEXT,
build_number INTEGER,
cuda INTEGER,
opencl INTEGER,
metal INTEGER,
gpu_blas INTEGER,
blas INTEGER,
cpu_info TEXT,
gpu_info TEXT,
model_filename TEXT,
model_type TEXT,
model_size INTEGER,
model_n_params INTEGER,
n_batch INTEGER,
n_threads INTEGER,
f16_kv INTEGER,
n_gpu_layers INTEGER,
main_gpu INTEGER,
mul_mat_q INTEGER,
tensor_split TEXT,
n_prompt INTEGER,
n_gen INTEGER,
test_time TEXT,
avg_ns INTEGER,
stddev_ns INTEGER,
avg_ts REAL,
stddev_ts REAL
);
INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
```
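The output can be piped directly into `sqlite3` (e.g. `./llama-bench -o sql | sqlite3 llama-bench.db`), or loaded programmatically; a minimal Python sketch (file names assumed):

```python
# Load llama-bench SQL output into SQLite, e.g. after: ./llama-bench -o sql > results.sql
import sqlite3

db = sqlite3.connect("llama-bench.db")
with open("results.sql") as f:
    db.executescript(f.read())

for model_type, n_prompt, n_gen, avg_ts in db.execute(
        "SELECT model_type, n_prompt, n_gen, avg_ts FROM test"):
    print(model_type, f"pp {n_prompt}" if n_gen == 0 else f"tg {n_gen}", f"{avg_ts:.2f} t/s")
db.close()
```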

View file

@@ -903,7 +903,7 @@ static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads)
}
}
static void llama_null_log_callback(enum llama_log_level level, const char * text, void * user_data) {
static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) text;
(void) user_data;

View file

@@ -1,22 +1,25 @@
#!/usr/bin/env python3
"""
This script converts Hugging Face llama models to GGML and quantizes them.
This script converts Hugging Face Llama, StarCoder, Falcon, Baichuan, and GPT-NeoX models to GGUF and quantizes them.
Usage:
python make-ggml.py --model {model_dir_or_hf_repo_name} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]
python make-ggml.py {model_dir_or_hf_repo_name} --model_type {model_type} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]
Arguments:
- --model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
- model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
- --model_type: (Required) The type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.
- --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used.
- --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used.
- --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'.
- --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created.
Quant types:
Old quant types (some base model types require these):
- Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
- Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
- Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
- Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M
New quant types (recommended):
- Q2_K: smallest, extreme quality loss - not recommended
- Q3_K: alias for Q3_K_M
- Q3_K_S: very small, very high quality loss
@@ -40,9 +43,7 @@ import argparse
import os
from huggingface_hub import snapshot_download
def main(model, outname, outdir, quants, keep_fp16):
ggml_version = "v3"
def main(model, model_type, outname, outdir, quants, keep_fp16):
if not os.path.isdir(model):
print(f"Model not found at {model}. Downloading...")
try:
@@ -63,17 +64,20 @@ def main(model, outname, outdir, quants, keep_fp16):
print("Building llama.cpp")
subprocess.run(f"cd .. && make quantize", shell=True, check=True)
fp16 = f"{outdir}/{outname}.ggml{ggml_version}.fp16.bin"
fp16 = f"{outdir}/{outname}.gguf.fp16.bin"
print(f"Making unquantised GGML at {fp16}")
print(f"Making unquantised GGUF at {fp16}")
if not os.path.isfile(fp16):
subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
if model_type != "llama":
subprocess.run(f"python3 ../convert-{model_type}-hf-to-gguf.py {model} 1 --outfile {fp16}", shell=True, check=True)
else:
subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
else:
print(f"Unquantised GGML already exists at: {fp16}")
print("Making quants")
for type in quants:
outfile = f"{outdir}/{outname}.ggml{ggml_version}.{type}.bin"
outfile = f"{outdir}/{outname}.gguf.{type}.bin"
print(f"Making {type} : {outfile}")
subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True)
@@ -81,8 +85,9 @@ def main(model, outname, outdir, quants, keep_fp16):
os.remove(fp16)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Convert/Quantize HF to GGML. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
parser.add_argument('--model', required=True, help='Downloaded model dir or Hugging Face model repo name')
parser = argparse.ArgumentParser(description='Convert/Quantize HF models to GGUF. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
parser.add_argument('model', help='Downloaded model dir or Hugging Face model repo name')
parser.add_argument('--model_type', required=True, choices=['llama', 'starcoder', 'falcon', 'baichuan', 'gptneox'], help='Type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.')
parser.add_argument('--outname', default=None, help='Output model(s) name')
parser.add_argument('--outdir', default=None, help='Output directory')
parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types')
@@ -90,4 +95,4 @@ if __name__ == "__main__":
args = parser.parse_args()
main(args.model, args.outname, args.outdir, args.quants, args.keep_fp16)
main(args.model, args.model_type, args.outname, args.outdir, args.quants, args.keep_fp16)

View file

@@ -1,3 +1,21 @@
# perplexity
TODO
## Llama 2 70B Scorechart
Quantization | Model size (GiB) | Perplexity | Delta to fp16
-- | -- | -- | --
Q4_0 | 36.20 | 3.5550 | 3.61%
Q4_1 | 40.20 | 3.5125 | 2.37%
Q5_0 | 44.20 | 3.4744 | 1.26%
Q2_K | 27.27 | 3.7339 | 8.82%
Q3_K_S | 27.86 | 3.7019 | 7.89%
Q3_K_M | 30.83 | 3.5932 | 4.72%
Q3_K_L | 33.67 | 3.5617 | 3.80%
Q4_K_S | 36.39 | 3.4852 | 1.57%
Q4_K_M | 38.54 | 3.4725 | 1.20%
Q5_K_S | 44.20 | 3.4483 | 0.50%
Q5_K_M | 45.41 | 3.4451 | 0.40%
Q6_K | 52.70 | 3.4367 | 0.16%
fp16 | 128.5 | 3.4313 | -
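The last column is derived from the perplexity values, with the fp16 row as the baseline; a quick sketch of the arithmetic:

```python
# Reproduce the "Delta to fp16" column from the perplexity values above.
fp16 = 3.4313
for quant, ppl in [("Q4_0", 3.5550), ("Q4_K_M", 3.4725), ("Q6_K", 3.4367)]:
    print(f"{quant}: {(ppl - fp16) / fp16 * 100:.2f}%")  # -> 3.61%, 1.20%, 0.16%
```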

View file

@@ -1,3 +1,4 @@
#include "build-info.h"
#include "common.h"
#include "llama.h"

View file

@@ -1,4 +1,5 @@
#define LLAMA_API_INTERNAL
#include "build-info.h"
#include "common.h"
#include "ggml.h"
#include "llama.h"

View file

@@ -1,3 +1,44 @@
# quantize
TODO
## Llama 2 7B
Quantization | Bits per Weight (BPW)
-- | --
Q2_K | 3.35
Q3_K_S | 3.50
Q3_K_M | 3.91
Q3_K_L | 4.27
Q4_K_S | 4.58
Q4_K_M | 4.84
Q5_K_S | 5.52
Q5_K_M | 5.68
Q6_K | 6.56
## Llama 2 13B
Quantization | Bits per Weight (BPW)
-- | --
Q2_K | 3.34
Q3_K_S | 3.48
Q3_K_M | 3.89
Q3_K_L | 4.26
Q4_K_S | 4.56
Q4_K_M | 4.83
Q5_K_S | 5.51
Q5_K_M | 5.67
Q6_K | 6.56
## Llama 2 70B
Quantization | Bits per Weight (BPW)
-- | --
Q2_K | 3.40
Q3_K_S | 3.47
Q3_K_M | 3.85
Q3_K_L | 4.19
Q4_K_S | 4.53
Q4_K_M | 4.80
Q5_K_S | 5.50
Q5_K_M | 5.65
Q6_K | 6.56
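Bits per weight maps directly to an approximate file size: `size_bytes ≈ n_params * BPW / 8`. A rough sketch (the ~69B parameter count for Llama 2 70B is an assumption, and metadata overhead is ignored):

```python
# Approximate model file size in GiB from bits per weight (BPW).
n_params = 68.98e9  # assumed parameter count for Llama 2 70B
for quant, bpw in [("Q4_K_M", 4.80), ("Q5_K_S", 5.50), ("Q6_K", 6.56)]:
    print(f"{quant}: ~{n_params * bpw / 8 / 2**30:.1f} GiB")
# -> roughly 38.5, 44.2, 52.7 GiB, in line with the sizes listed in the perplexity example
```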

View file

@@ -1,3 +1,4 @@
#include "build-info.h"
#include "common.h"
#include "llama.h"

View file

@@ -1,3 +1,4 @@
#include "build-info.h"
#include "common.h"
#include "llama.h"

View file

@@ -701,8 +701,8 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
printf(" --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");