Merge 'origin/master' into hipblas

2023-06-14 22:41:55 +03:00 · 2023-06-14 22:41:55 +03:00 · a836529996
commit a836529996
parent 85f902d5c4 254a7a7a5f
32 changed files with 7707 additions and 694 deletions
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@ -16,4 +16,6 @@ COPY . .
 RUN make
 ENV LC_ALL=C.utf8
 ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/main.Dockerfile
+++ b/.devops/main.Dockerfile
@ -15,4 +15,6 @@ FROM ubuntu:$UBUNTU_VERSION as runtime
 COPY --from=build /app/main /main
 ENV LC_ALL=C.utf8
 ENTRYPOINT [ "/main" ]
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -10,10 +10,10 @@ on:
  push:
    branches:
      - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
 env:
 BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -464,6 +464,9 @@ target_link_libraries(llama PRIVATE
 if (BUILD_SHARED_LIBS)
    set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
    if (LLAMA_METAL)
        set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
    endif()
 endif()
 if (GGML_SOURCES_CUDA)
--- a/5
+++ b/5
@ -107,6 +107,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
 	# Usage AVX-only
 	#CFLAGS   += -mfma -mf16c -mavx
 	#CXXFLAGS += -mfma -mf16c -mavx
 	# Usage SSSE3-only (Not is SSE3!)
 	#CFLAGS   += -mssse3
 	#CXXFLAGS += -mssse3
 endif
 ifneq ($(filter ppc64%,$(UNAME_M)),)
@ -123,6 +127,7 @@ endif
 ifndef LLAMA_NO_K_QUANTS
 	CFLAGS   += -DGGML_USE_K_QUANTS
 	CXXFLAGS += -DGGML_USE_K_QUANTS
 	OBJS     += k_quants.o
 endif
--- a/README.md
+++ b/README.md
@ -308,7 +308,7 @@ Building the program with BLAS support may lead to some performance improvements
 - #### BLIS
-  Check [BLIS.md](BLIS.md) for more information.
+  Check [BLIS.md](docs/BLIS.md) for more information.
 - #### Intel MKL
--- a/8
+++ b/8
@ -1,6 +1,6 @@
 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d  models/7B/consolidated.00.pth
 666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847  models/7B/ggml-model-f16.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q4_0.bin
+ec2f2d1f0dfb73b72a4cbac7fa121abbe04c37ab327125a38248f930c0f09ddf  models/7B/ggml-model-q4_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q4_1.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q5_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q5_1.bin
@ -8,7 +8,7 @@ ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml
 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08  models/13B/consolidated.00.pth
 d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085  models/13B/consolidated.01.pth
 2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808  models/13B/ggml-model-f16.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q4_0.bin
+fad169e6f0f575402cf75945961cb4a8ecd824ba4da6be2af831f320c4348fa5  models/13B/ggml-model-q4_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q4_1.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q5_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q5_1.bin
@ -18,7 +18,7 @@ e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067  models/30B/con
 24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378  models/30B/consolidated.02.pth
 1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b  models/30B/consolidated.03.pth
 7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37  models/30B/ggml-model-f16.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/30B/ggml-model-q4_0.bin
+d2a441403944819492ec8c2002cc36fa38468149bfb4b7b4c52afc7bd9a7166d  models/30B/ggml-model-q4_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/30B/ggml-model-q4_1.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/30B/ggml-model-q5_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/30B/ggml-model-q5_1.bin
@ -32,7 +32,7 @@ a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78  models/65B/con
 72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b  models/65B/consolidated.06.pth
 d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638  models/65B/consolidated.07.pth
 60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0  models/65B/ggml-model-f16.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/65B/ggml-model-q4_0.bin
+cde053439fa4910ae454407e2717cc46cc2c2b4995c00c93297a2b52e790fa92  models/65B/ggml-model-q4_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/65B/ggml-model-q4_1.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/65B/ggml-model-q5_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/65B/ggml-model-q5_1.bin
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -37,6 +37,7 @@ else()
    add_subdirectory(save-load-state)
    add_subdirectory(benchmark)
    add_subdirectory(baby-llama)
    add_subdirectory(train-text-from-scratch)
    if (LLAMA_METAL)
        add_subdirectory(metal)
    endif()
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@ -79,34 +79,39 @@ struct ggml_tensor * randomize_tensor_normal(
        int ndims,
        const int64_t ne[],
        struct random_normal_distribution * rnd) {
    float scale = 1.0; // xavier
    switch (ndims) {
        case 1:
            scale /= sqrtf(ne[0]);
            for (int i0 = 0; i0 < ne[0]; i0++) {
-                ((float *)tensor->data)[i0] = frand_normal(rnd);
+                ((float *)tensor->data)[i0] = scale * frand_normal(rnd);
            }
            break;
        case 2:
            scale /= sqrtf(ne[0]+ne[1]);
            for (int i1 = 0; i1 < ne[1]; i1++) {
                for (int i0 = 0; i0 < ne[0]; i0++) {
-                    ((float *)tensor->data)[i1*ne[0] + i0] = frand_normal(rnd);
+                    ((float *)tensor->data)[i1*ne[0] + i0] = scale * frand_normal(rnd);
                }
            }
            break;
        case 3:
            scale /= sqrtf(ne[0]+ne[1]);
            for (int i2 = 0; i2 < ne[2]; i2++) {
                for (int i1 = 0; i1 < ne[1]; i1++) {
                    for (int i0 = 0; i0 < ne[0]; i0++) {
-                        ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand_normal(rnd);
+                        ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
                    }
                }
            }
            break;
        case 4:
            scale /= sqrtf(ne[0]+ne[1]);
            for (int i3 = 0; i3 < ne[3]; i3++) {
                for (int i2 = 0; i2 < ne[2]; i2++) {
                    for (int i1 = 0; i1 < ne[1]; i1++) {
                        for (int i0 = 0; i0 < ne[0]; i0++) {
-                            ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand_normal(rnd);
+                            ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
                        }
                    }
                }
@ -148,8 +153,8 @@ struct llama_hparams_lora {
    uint32_t n_rot   = 64;
    uint32_t n_lora  = 64;
-    bool operator!=(const llama_hparams & other) const {
+    bool operator!=(const llama_hparams_lora & other) const {
-        return memcmp(this, &other, sizeof(llama_hparams));
+        return memcmp(this, &other, sizeof(llama_hparams_lora)) != 0;
    }
 };
--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -331,6 +331,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            }
 #else
      fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
 #endif // GGML_USE_CUBLAS
        } else if (arg == "--low-vram" || arg == "-lv") {
 #ifdef GGML_USE_CUBLAS
            params.low_vram = true;
 #else
      fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
 #endif // GGML_USE_CUBLAS
        } else if (arg == "--no-mmap") {
            params.use_mmap = false;
@ -479,6 +485,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stderr, "  -ts SPLIT --tensor-split SPLIT\n");
    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
    fprintf(stderr, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n" );
    fprintf(stderr, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n" );
 #endif
    fprintf(stderr, "  --mtest               compute maximum memory usage\n");
    fprintf(stderr, "  --export              export the computation graph to 'llama.ggml'\n");
@ -528,6 +535,7 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
    lparams.n_gpu_layers = params.n_gpu_layers;
    lparams.main_gpu     = params.main_gpu;
    memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
    lparams.low_vram     = params.low_vram;
    lparams.seed         = params.seed;
    lparams.f16_kv       = params.memory_f16;
    lparams.use_mmap     = params.use_mmap;
@ -632,6 +640,9 @@ void console_set_color(console_state & con_st, console_color_t color) {
            case CONSOLE_COLOR_USER_INPUT:
                fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_GREEN);
                break;
            case CONSOLE_COLOR_ERROR:
                fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_RED);
                break;
        }
        con_st.color = color;
        fflush(con_st.out);
--- a/examples/common.h
+++ b/examples/common.h
@ -21,15 +21,16 @@
 int32_t get_num_physical_cores();
 struct gpt_params {
-    int32_t seed                           = -1;   // RNG seed
+    int32_t seed                            = -1;  // RNG seed
-    int32_t n_threads                      = get_num_physical_cores();
+    int32_t n_threads                       = get_num_physical_cores();
-    int32_t n_predict                      = -1;   // new tokens to predict
+    int32_t n_predict                       = -1;  // new tokens to predict
-    int32_t n_ctx                          = 512;  // context size
+    int32_t n_ctx                           = 512; // context size
-    int32_t n_batch                        = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_batch                         = 512; // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep                         = 0;    // number of tokens to keep from initial prompt
+    int32_t n_keep                          = 0;   // number of tokens to keep from initial prompt
-    int32_t n_gpu_layers                   = 0;    // number of layers to store in VRAM
+    int32_t n_gpu_layers                    = 0;   // number of layers to store in VRAM
-    int32_t main_gpu                       = 0;    // the GPU that is used for scratch and small tensors
+    int32_t main_gpu                        = 0;   // the GPU that is used for scratch and small tensors
    float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
    bool    low_vram                        = 0;   // if true, reduce VRAM usage at the cost of performance
    // sampling parameters
    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
@ -112,7 +113,8 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params);
 enum console_color_t {
    CONSOLE_COLOR_DEFAULT=0,
    CONSOLE_COLOR_PROMPT,
-    CONSOLE_COLOR_USER_INPUT
+    CONSOLE_COLOR_USER_INPUT,
    CONSOLE_COLOR_ERROR
 };
 struct console_state {
--- a/examples/main/README.md
+++ b/examples/main/README.md
@ -288,5 +288,6 @@ These options provide extra functionality and customization when running the LLa
 -   `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 -   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 -   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
 -   `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
 -   `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 -   `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -81,6 +81,9 @@ int main(int argc, char ** argv) {
    if (params.n_ctx > 2048) {
        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
                "expect poor results\n", __func__, params.n_ctx);
    } else if (params.n_ctx < 8) {
        fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
        params.n_ctx = 8;
    }
    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
@ -328,9 +331,29 @@ int main(int argc, char ** argv) {
    std::vector<llama_token> embd;
    // do one empty run to warm up the model
    {
        const std::vector<llama_token> tmp = { llama_token_bos(), };
        llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
        llama_reset_timings(ctx);
    }
    while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
        // predict
        if (embd.size() > 0) {
            // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
            // --prompt or --file which uses the same value.
            auto max_embd_size = n_ctx - 4;
            // Ensure the input doesn't exceed the context size by truncating embd if necessary.
            if ((int)embd.size() > max_embd_size) {
                auto skipped_tokens = embd.size() - max_embd_size;
                console_set_color(con_st, CONSOLE_COLOR_ERROR);
                printf("<<input too long: skipped %ld token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
                console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
                fflush(stdout);
                embd.resize(max_embd_size);
            }
            // infinite text generation via context swapping
            // if we run out of context:
            // - take the n_keep first tokens from the original prompt (via n_past)
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@ -3,43 +3,136 @@
 #include "llama.h"
 #include <cstdio>
-#include <map>
+#include <cstring>
 #include <vector>
 #include <string>
-static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
+struct quant_option {
-  {"q4_0",   LLAMA_FTYPE_MOSTLY_Q4_0},
+    std::string name;
-  {"q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1},
+    llama_ftype ftype;
-  {"q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0},
+    std::string desc;
  {"q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1},
  {"q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0},
  {"q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K},
  {"q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M},
  {"q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S},
  {"q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M},
  {"q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L},
  {"q4_K",   LLAMA_FTYPE_MOSTLY_Q4_K_M},
  {"q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S},
  {"q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M},
  {"q5_K",   LLAMA_FTYPE_MOSTLY_Q5_K_M},
  {"q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S},
  {"q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M},
  {"q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K},
 };
-bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::string & ftype_str_out) {
+static const std::vector<struct quant_option> QUANT_OPTIONS = {
-    auto it = LLAMA_FTYPE_MAP.find(ftype_str);
+    {
-    if (it != LLAMA_FTYPE_MAP.end()) {
+        "Q4_0",
-        ftype = it->second;
+        LLAMA_FTYPE_MOSTLY_Q4_0,
-        ftype_str_out = it->first;
+        " 3.50G, +0.2499 ppl @ 7B - small, very high quality loss - legacy, prefer using Q3_K_M",
-        return true;
+    },
    {
        "Q4_1",
        LLAMA_FTYPE_MOSTLY_Q4_1,
        " 3.90G, +0.1846 ppl @ 7B - small, substantial quality loss - legacy, prefer using Q3_K_L",
    },
    {
        "Q5_0",
        LLAMA_FTYPE_MOSTLY_Q5_0,
        " 4.30G, +0.0796 ppl @ 7B - medium, balanced quality - legacy, prefer using Q4_K_M",
    },
    {
        "Q5_1",
        LLAMA_FTYPE_MOSTLY_Q5_1,
        " 4.70G, +0.0415 ppl @ 7B - medium, low quality loss - legacy, prefer using Q5_K_M",
    },
 #ifdef GGML_USE_K_QUANTS
    {
        "Q2_K",
        LLAMA_FTYPE_MOSTLY_Q2_K,
        " 2.67G, +0.8698 ppl @ 7B - smallest, extreme quality loss - not recommended",
    },
    {
        "Q3_K",
        LLAMA_FTYPE_MOSTLY_Q3_K_M,
        "alias for Q3_K_M"
    },
    {
        "Q3_K_S",
        LLAMA_FTYPE_MOSTLY_Q3_K_S,
        " 2.75G, +0.5505 ppl @ 7B - very small, very high quality loss",
    },
    {
        "Q3_K_M",
        LLAMA_FTYPE_MOSTLY_Q3_K_M,
        " 3.06G, +0.2437 ppl @ 7B - very small, very high quality loss",
    },
    {
        "Q3_K_L",
        LLAMA_FTYPE_MOSTLY_Q3_K_L,
        " 3.35G, +0.1803 ppl @ 7B - small, substantial quality loss",
    },
    {
        "Q4_K",
        LLAMA_FTYPE_MOSTLY_Q4_K_M,
        "alias for Q4_K_M",
    },
    {
        "Q4_K_S",
        LLAMA_FTYPE_MOSTLY_Q4_K_S,
        " 3.56G, +0.1149 ppl @ 7B - small, significant quality loss",
    },
    {
        "Q4_K_M",
        LLAMA_FTYPE_MOSTLY_Q4_K_M,
        " 3.80G, +0.0535 ppl @ 7B - medium, balanced quality - *recommended*",
    },
    {
        "Q5_K",
        LLAMA_FTYPE_MOSTLY_Q5_K_M,
        "alias for Q5_K_M",
    },
    {
        "Q5_K_S",
        LLAMA_FTYPE_MOSTLY_Q5_K_S,
        " 4.33G, +0.0353 ppl @ 7B - large, low quality loss - *recommended*",
    },
    {
        "Q5_K_M",
        LLAMA_FTYPE_MOSTLY_Q5_K_M,
        " 4.45G, +0.0142 ppl @ 7B - large, very low quality loss - *recommended*",
    },
    {
        "Q6_K",
        LLAMA_FTYPE_MOSTLY_Q6_K,
        " 5.15G, +0.0044 ppl @ 7B - very large, extremely low quality loss",
    },
 #endif
    {
        "Q8_0",
        LLAMA_FTYPE_MOSTLY_Q8_0,
        " 6.70G, +0.0004 ppl @ 7B - very large, extremely low quality loss - not recommended",
    },
    {
        "F16",
        LLAMA_FTYPE_MOSTLY_F16,
        "13.00G              @ 7B - extremely large, virtually no quality loss - not recommended",
    },
    {
        "F32",
        LLAMA_FTYPE_ALL_F32,
        "26.00G              @ 7B - absolutely huge, lossless - not recommended",
    },
 };
 bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
    std::string ftype_str;
    for (auto ch : ftype_str_in) {
        ftype_str.push_back(std::toupper(ch));
    }
    for (auto & it : QUANT_OPTIONS) {
        if (it.name == ftype_str) {
            ftype = it.ftype;
            ftype_str_out = it.name;
            return true;
        }
    }
    // try to parse as an integer
    try {
        int ftype_int = std::stoi(ftype_str);
-        for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
+        for (auto & it : QUANT_OPTIONS) {
-            if (it->second == ftype_int) {
+            if (it.ftype == ftype_int) {
-                ftype = it->second;
+                ftype = it.ftype;
-                ftype_str_out = it->first;
+                ftype_str_out = it.name;
                return true;
            }
        }
@ -51,29 +144,51 @@ bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::st
 }
 // usage:
-//  ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
+//  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
 //
 void usage(const char * executable) {
    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
    fprintf(stderr, "  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
    fprintf(stderr, "  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
    fprintf(stderr, "\nAllowed quantization types:\n");
    for (auto & it : QUANT_OPTIONS) {
        printf("  %2d  or  %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str());
    }
    exit(1);
 }
 int main(int argc, char ** argv) {
    if (argc < 3) {
-        fprintf(stderr, "usage: %s model-f32.bin [model-quant.bin] type [nthreads]\n", argv[0]);
+        usage(argv[0]);
-        for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
+    }
-            fprintf(stderr, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
+
    llama_model_quantize_params params = llama_model_quantize_default_params();
    int arg_idx = 1;
    for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
        if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
            params.quantize_output_tensor = false;
        } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
            params.allow_requantize = true;
        } else {
            usage(argv[0]);
        }
-        return 1;
+    }
    if (argc - arg_idx < 3) {
        usage(argv[0]);
    }
    llama_init_backend();
    // parse command line arguments
-    const std::string fname_inp = argv[1];
+    const std::string fname_inp = argv[arg_idx];
    arg_idx++;
    std::string fname_out;
    int nthread;
    llama_ftype ftype;
    int arg_idx = 2;
    std::string ftype_str;
-    if (try_parse_ftype(argv[arg_idx], ftype, ftype_str)) {
+    if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
        // argv[2] is the ftype
        std::string fpath;
        const size_t pos = fname_inp.find_last_of('/');
        if (pos != std::string::npos) {
@ -84,7 +199,6 @@ int main(int argc, char ** argv) {
        arg_idx++;
    }
    else {
        // argv[2] is the output path
        fname_out = argv[arg_idx];
        arg_idx++;
@ -92,8 +206,7 @@ int main(int argc, char ** argv) {
            fprintf(stderr, "%s: missing ftype\n", __func__);
            return 1;
        }
-        // argv[3] is the ftype
+        if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
        if (!try_parse_ftype(argv[arg_idx], ftype, ftype_str)) {
            fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
            return 1;
        }
@ -103,21 +216,19 @@ int main(int argc, char ** argv) {
    // parse nthreads
    if (argc > arg_idx) {
        try {
-            nthread = std::stoi(argv[arg_idx]);
+            params.nthread = std::stoi(argv[arg_idx]);
        }
        catch (const std::exception & e) {
            fprintf(stderr, "%s: invalid nthread '%s' (%s)\n", __func__, argv[arg_idx], e.what());
            return 1;
        }
    } else {
        nthread = 0;
    }
    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
    fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
-    if (nthread > 0) {
+    if (params.nthread > 0) {
-        fprintf(stderr, " using %d threads", nthread);
+        fprintf(stderr, " using %d threads", params.nthread);
    }
    fprintf(stderr, "\n");
@ -129,7 +240,7 @@ int main(int argc, char ** argv) {
    {
        const int64_t t_start_us = llama_time_us();
-        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
+        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), &params)) {
            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
            return 1;
        }
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -289,6 +289,7 @@ Test();
 -   `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 -   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 -   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
 -   `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
 -   `--embedding`: Enable the embedding mode. **Completion function doesn't work in this mode**.
 -   `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`;
 -   `--port`: Set the port to listen. Default: `8080`.
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -405,6 +405,7 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params)
  fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
  fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
  fprintf(stderr, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n" );
  fprintf(stderr, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n" );
 #endif
  fprintf(stderr, "  -m FNAME, --model FNAME\n");
  fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
@ -537,6 +538,14 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
      }
 #else
      fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
 #endif // GGML_USE_CUBLAS
    }
    else if (arg == "--low-vram" || arg == "-lv")
    {
 #ifdef GGML_USE_CUBLAS
      params.low_vram = true;
 #else
      fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
 #endif // GGML_USE_CUBLAS
    }
    else if (arg == "--main-gpu" || arg == "-mg")
--- a/examples/train-text-from-scratch/CMakeLists.txt
+++ b/examples/train-text-from-scratch/CMakeLists.txt
@ -0,0 +1,4 @@
 set(TARGET train-text-from-scratch)
 add_executable(${TARGET} train-text-from-scratch.cpp)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/train-text-from-scratch/README.md
+++ b/examples/train-text-from-scratch/README.md
@ -0,0 +1,22 @@
 # train-text-from-scratch
 Basic usage instructions:
 ```bash
 # get training data
 wget https://github.com/brunoklein99/deep-learning-notes/blob/master/shakespeare.txt
 # train
 ./bin/train-text-from-scratch \
        --vocab-model ../models/ggml-vocab.bin \
        --ctx 64 --embd 256 --head 8 --layer 16 \
        --checkpoint-in  chk-shakespeare-256x16.bin \
        --checkpoint-out chk-shakespeare-256x16.bin \
        --model-out ggml-shakespeare-256x16-f32.bin \
        --train-data "shakespeare.txt" \
        -t 6 -b 16 -n 32 --seed 1 --adam-iter 16 \
        --print-details-interval 0 --predict 16 --use-flash
 # predict
 ./bin/main -m ggml-shakespeare-256x16-f32.bin
 ```
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
--- a/flake.nix
+++ b/flake.nix
@ -28,7 +28,7 @@
          postPatch =
            if isM1 then ''
              substituteInPlace ./ggml-metal.m \
-                --replace '[[NSBundle mainBundle] pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/ggml-metal.metal\";"
+                --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/ggml-metal.metal\";"
            '' else "";
          nativeBuildInputs = with pkgs; [ cmake ];
          buildInputs = osSpecific;
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@ -24,11 +24,14 @@ void   ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
 void * ggml_cuda_host_malloc(size_t size);
 void   ggml_cuda_host_free(void * ptr);
-void   ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset);
+void   ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
 void   ggml_cuda_free_data(struct ggml_tensor * tensor);
 void   ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
 void   ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
 void   ggml_cuda_set_main_device(int main_device);
 void   ggml_cuda_set_scratch_size(size_t scratch_size);
 void   ggml_cuda_free_scratch(void);
 bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
 #ifdef  __cplusplus
--- a/ggml-metal.m
+++ b/ggml-metal.m
@ -45,15 +45,26 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(scale);
    GGML_METAL_DECL_KERNEL(silu);
    GGML_METAL_DECL_KERNEL(relu);
    GGML_METAL_DECL_KERNEL(gelu);
    GGML_METAL_DECL_KERNEL(soft_max);
    GGML_METAL_DECL_KERNEL(diag_mask_inf);
    GGML_METAL_DECL_KERNEL(get_rows_f16);
    GGML_METAL_DECL_KERNEL(get_rows_q4_0);
    GGML_METAL_DECL_KERNEL(get_rows_q4_1);
    GGML_METAL_DECL_KERNEL(get_rows_q2_k);
    GGML_METAL_DECL_KERNEL(get_rows_q3_k);
    GGML_METAL_DECL_KERNEL(get_rows_q4_k);
    GGML_METAL_DECL_KERNEL(get_rows_q5_k);
    GGML_METAL_DECL_KERNEL(get_rows_q6_k);
    GGML_METAL_DECL_KERNEL(rms_norm);
    GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_q2_k_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_q3_k_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_q4_k_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_q5_k_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_q6_k_f32);
    GGML_METAL_DECL_KERNEL(rope);
    GGML_METAL_DECL_KERNEL(cpy_f32_f16);
    GGML_METAL_DECL_KERNEL(cpy_f32_f32);
@ -66,6 +77,12 @@ struct ggml_metal_context {
 //       for now it is easier to work in a separate file
 static NSString * const msl_library_source = @"see metal.metal";
 // Here to assist with NSBundle Path Hack
@interface GGMLMetalClass : NSObject
@end
@implementation GGMLMetalClass
@end
 struct ggml_metal_context * ggml_metal_init(void) {
    fprintf(stderr, "%s: allocating\n", __func__);
@ -73,6 +90,7 @@ struct ggml_metal_context * ggml_metal_init(void) {
    ctx->device = MTLCreateSystemDefaultDevice();
    ctx->queue  = [ctx->device newCommandQueue];
    ctx->n_buffers = 0;
    // determine if we can use MPS
    if (MPSSupportsMTLDevice(ctx->device)) {
@ -101,7 +119,8 @@ struct ggml_metal_context * ggml_metal_init(void) {
        NSError * error = nil;
        //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
-        NSString * path = [[NSBundle mainBundle] pathForResource:@"ggml-metal" ofType:@"metal"];
+        NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
        NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
        fprintf(stderr, "%s: loading '%s'\n", __func__, [path UTF8String]);
        NSString * src  = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
@ -131,15 +150,26 @@ struct ggml_metal_context * ggml_metal_init(void) {
        GGML_METAL_ADD_KERNEL(scale);
        GGML_METAL_ADD_KERNEL(silu);
        GGML_METAL_ADD_KERNEL(relu);
        GGML_METAL_ADD_KERNEL(gelu);
        GGML_METAL_ADD_KERNEL(soft_max);
        GGML_METAL_ADD_KERNEL(diag_mask_inf);
        GGML_METAL_ADD_KERNEL(get_rows_f16);
        GGML_METAL_ADD_KERNEL(get_rows_q4_0);
        GGML_METAL_ADD_KERNEL(get_rows_q4_1);
        GGML_METAL_ADD_KERNEL(get_rows_q2_k);
        GGML_METAL_ADD_KERNEL(get_rows_q3_k);
        GGML_METAL_ADD_KERNEL(get_rows_q4_k);
        GGML_METAL_ADD_KERNEL(get_rows_q5_k);
        GGML_METAL_ADD_KERNEL(get_rows_q6_k);
        GGML_METAL_ADD_KERNEL(rms_norm);
        GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_q2_k_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_q3_k_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_q4_k_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_q5_k_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_q6_k_f32);
        GGML_METAL_ADD_KERNEL(rope);
        GGML_METAL_ADD_KERNEL(cpy_f32_f16);
        GGML_METAL_ADD_KERNEL(cpy_f32_f32);
@ -412,6 +442,20 @@ void ggml_metal_graph_compute(
                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                } break;
            case GGML_OP_GELU:
            {
                    if (encoder == nil) {
                        encoder = [command_buffer computeCommandEncoder];
                    }
                    [encoder setComputePipelineState:ctx->pipeline_gelu];
                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                    const int64_t n = ggml_nelements(dst);
                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
            } break;
            case GGML_OP_SOFT_MAX:
                {
                    if (encoder == nil) {
@ -518,9 +562,36 @@ void ggml_metal_graph_compute(
                                    GGML_ASSERT(ne12 == 1);
                                    nth0 = 8;
-                                    nth1 = 4;
+                                    nth1 = 8;
                                    [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32];
                                } break;
                            case GGML_TYPE_Q4_1:
                                {
                                    GGML_ASSERT(ne02 == 1);
                                    GGML_ASSERT(ne12 == 1);
                                    nth0 = 8;
                                    nth1 = 8;
                                    [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_1_f32];
                                } break;
                            case GGML_TYPE_Q2_K:
                                {
                                    GGML_ASSERT(ne02 == 1);
                                    GGML_ASSERT(ne12 == 1);
                                    nth0 = 4;
                                    nth1 = 16;
                                    [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_k_f32];
                                } break;
                            case GGML_TYPE_Q3_K:
                                {
                                    GGML_ASSERT(ne02 == 1);
                                    GGML_ASSERT(ne12 == 1);
                                    nth0 = 4;
                                    nth1 = 16;
                                    [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_k_f32];
                                } break;
                            case GGML_TYPE_Q4_K:
                                {
                                    GGML_ASSERT(ne02 == 1);
@ -530,6 +601,24 @@ void ggml_metal_graph_compute(
                                    nth1 = 16;
                                    [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32];
                                } break;
                            case GGML_TYPE_Q5_K:
                                {
                                    GGML_ASSERT(ne02 == 1);
                                    GGML_ASSERT(ne12 == 1);
                                    nth0 = 4;
                                    nth1 = 16;
                                    [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_k_f32];
                                } break;
                            case GGML_TYPE_Q6_K:
                                {
                                    GGML_ASSERT(ne02 == 1);
                                    GGML_ASSERT(ne12 == 1);
                                    nth0 = 4;
                                    nth1 = 16;
                                    [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_k_f32];
                                } break;
                            default:
                                {
                                    fprintf(stderr, "Asserting on type %d\n",(int)src0t);
@ -554,12 +643,17 @@ void ggml_metal_graph_compute(
                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:13];
                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:14];
-                        if (src0t == GGML_TYPE_Q4_0) {
+                        if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
                            [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                        } else if (src0t == GGML_TYPE_Q4_K) {
+                        }
                        else if (src0t == GGML_TYPE_Q2_K ||
                                 src0t == GGML_TYPE_Q3_K ||
                                 src0t == GGML_TYPE_Q4_K ||
                                 src0t == GGML_TYPE_Q5_K ||
                                 src0t == GGML_TYPE_Q6_K) {
                            [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                        } else {
                            [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@ -575,7 +669,12 @@ void ggml_metal_graph_compute(
                    switch (src0->type) {
                        case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
                        case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
                        case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
                        case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_k]; break;
                        case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_k]; break;
                        case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break;
                        case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_k]; break;
                        case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_k]; break;
                        default: GGML_ASSERT(false && "not implemented");
                    }
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@ -662,6 +662,15 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) {
    clReleaseMemObject(mem);
 }
 void ggml_cl_free_data(const struct ggml_tensor* tensor) {
    if (tensor->backend != GGML_BACKEND_GPU) {
        return;
    }
    cl_mem mem = (cl_mem)tensor->data;
    clReleaseMemObject(mem);
 }
 static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) {
    cl_int err;
    const uint64_t ne0 = src->ne[0];
@ -1158,7 +1167,7 @@ size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct g
    return 0;
 }
-void ggml_cl_transform_tensor(ggml_tensor * tensor) {
+void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
    const int64_t ne0 = tensor->ne[0];
    const int64_t ne1 = tensor->ne[1];
    const int64_t ne2 = tensor->ne[2];
@ -1170,6 +1179,7 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
    size_t q_size;
    cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
    tensor->data = data;
    // copy tensor to device
    for (int64_t i3 = 0; i3 < ne3; i3++) {
        for (int64_t i2 = 0; i2 < ne2; i2++) {
@ -1181,35 +1191,5 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
    CL_CHECK(clFinish(queue));
    tensor->data = dst;
-    tensor->backend = GGML_BACKEND_GPU;
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 }
 void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
    cl_int err;
    FILE * fp = fopen(fname, "rb");
    const size_t size = ggml_nbytes(tensor);
    cl_mem dst;
    CL_CHECK((dst = clCreateBuffer(context, CL_MEM_READ_ONLY, size, nullptr, &err), err));
    void * buf_host = malloc(size);
 #ifdef _WIN32
    int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
 #else
    int ret = fseek(fp, (long) offset, SEEK_SET);
 #endif
    GGML_ASSERT(ret == 0); // same
    size_t ret2 = fread(buf_host, size, 1, fp);
    if (ret2 != 1) {
        fprintf(stderr, "unexpectedly reached end of file");
        exit(1);
    }
    clEnqueueWriteBuffer(queue, dst, CL_TRUE, 0, size, buf_host, 0, nullptr, nullptr);
    tensor->data = dst;
    free(buf_host);
    fclose(fp);
 }
--- a/ggml-opencl.h
+++ b/ggml-opencl.h
@ -16,8 +16,9 @@ void   ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor
 void * ggml_cl_host_malloc(size_t size);
 void   ggml_cl_host_free(void * ptr);
-void ggml_cl_transform_tensor(struct ggml_tensor * tensor);
+void ggml_cl_free_data(const struct ggml_tensor* tensor);
-void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, size_t offset);
+
 void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
 #ifdef  __cplusplus
 }
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@ -296,6 +296,7 @@ extern "C" {
        GGML_OP_SUM_ROWS,
        GGML_OP_MEAN,
        GGML_OP_REPEAT,
        GGML_OP_REPEAT_BACK,
        GGML_OP_ABS,
        GGML_OP_SGN,
        GGML_OP_NEG,
@ -309,6 +310,7 @@ extern "C" {
        GGML_OP_RMS_NORM_BACK,
        GGML_OP_MUL_MAT,
        GGML_OP_OUT_PROD,
        GGML_OP_SCALE,
        GGML_OP_SET,
@ -324,6 +326,7 @@ extern "C" {
        GGML_OP_DIAG_MASK_INF,
        GGML_OP_DIAG_MASK_ZERO,
        GGML_OP_SOFT_MAX,
        GGML_OP_SOFT_MAX_BACK,
        GGML_OP_ROPE,
        GGML_OP_ROPE_BACK,
        GGML_OP_ALIBI,
@ -333,10 +336,14 @@ extern "C" {
        GGML_OP_FLASH_ATTN,
        GGML_OP_FLASH_FF,
        GGML_OP_FLASH_ATTN_BACK,
        GGML_OP_MAP_UNARY,
        GGML_OP_MAP_BINARY,
        GGML_OP_CROSS_ENTROPY_LOSS,
        GGML_OP_CROSS_ENTROPY_LOSS_BACK,
        GGML_OP_COUNT,
    };
@ -478,6 +485,7 @@ extern "C" {
    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
    GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
    // use this to compute the memory overhead of a tensor
    GGML_API size_t ggml_tensor_overhead(void);
@ -574,6 +582,11 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);
    GGML_API struct ggml_tensor * ggml_add1_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);
    GGML_API struct ggml_tensor * ggml_acc(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@ -645,6 +658,11 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);
    GGML_API struct ggml_tensor * ggml_repeat_back(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);
    GGML_API struct ggml_tensor * ggml_abs(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
@ -698,14 +716,22 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);
-    // A: m rows, n columns
+    // A: n columns, m rows
-    // B: p rows, n columns (i.e. we transpose it internally)
+    // B: n columns, p rows  (i.e. we transpose it internally)
    // result is m columns, p rows
    GGML_API struct ggml_tensor * ggml_mul_mat(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);
    // A: m columns, n rows,
    // B: p columns, n rows,
    // result is m columns, p rows
    GGML_API struct ggml_tensor * ggml_out_prod(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);
    //
    // operations on tensors without backpropagation
    //
@ -916,6 +942,17 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    GGML_API struct ggml_tensor * ggml_soft_max_back(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);
    // in-place, returns view(a)
    GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);
    // rotary position embedding
    // if mode & 1 == 1, skip n_past elements
    // if mode & 2 == 1, GPT-NeoX style
@ -982,6 +1019,14 @@ extern "C" {
            struct ggml_tensor  * v,
            bool                  masked);
    GGML_API struct ggml_tensor * ggml_flash_attn_back(
           struct ggml_context * ctx,
           struct ggml_tensor  * q,
           struct ggml_tensor  * k,
           struct ggml_tensor  * v,
           struct ggml_tensor  * d,
           bool                  masked);
    GGML_API struct ggml_tensor * ggml_flash_ff(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@ -1005,6 +1050,19 @@ extern "C" {
            struct ggml_tensor          * b,
                   ggml_binary_op_f32_t   fun);
    // loss function
    GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
            struct ggml_context         * ctx,
            struct ggml_tensor          * a,
            struct ggml_tensor          * b);
    GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
            struct ggml_context         * ctx,
            struct ggml_tensor          * a,
            struct ggml_tensor          * b,
            struct ggml_tensor          * c);
    //
    // automatic differentiation
    //
@ -1099,6 +1157,8 @@ extern "C" {
        struct {
            int n_iter;
            float sched; // schedule multiplier (fixed, decay or warmup)
            float decay; // weight decay for AdamW, use 0.0f to disable
            float alpha; // learning rate
            float beta1;
            float beta2;
@ -1123,6 +1183,49 @@ extern "C" {
        } lbfgs;
    };
    struct ggml_opt_context {
        struct ggml_context * ctx;
        struct ggml_opt_params params;
        int iter;
        int64_t nx; // number of parameter elements
        bool just_initialized;
        struct {
            struct ggml_tensor * x;  // view of the parameters
            struct ggml_tensor * g1; // gradient
            struct ggml_tensor * g2; // gradient squared
            struct ggml_tensor * m;  // first moment
            struct ggml_tensor * v;  // second moment
            struct ggml_tensor * mh; // first moment hat
            struct ggml_tensor * vh; // second moment hat
            struct ggml_tensor * pf; // past function values
            float fx_best;
            float fx_prev;
            int n_no_improvement;
        } adam;
        struct {
            struct ggml_tensor * x;    // current parameters
            struct ggml_tensor * xp;   // previous parameters
            struct ggml_tensor * g;    // current gradient
            struct ggml_tensor * gp;   // previous gradient
            struct ggml_tensor * d;    // search direction
            struct ggml_tensor * pf;   // past function values
            struct ggml_tensor * lmal; // the L-BFGS memory alpha
            struct ggml_tensor * lmys; // the L-BFGS memory ys
            struct ggml_tensor * lms;  // the L-BFGS memory s
            struct ggml_tensor * lmy;  // the L-BFGS memory y
            float fx_best;
            float step;
            int j;
            int k;
            int end;
            int n_no_improvement;
        } lbfgs;
    };
    GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
    // optimize the function defined by the tensor f
@ -1131,6 +1234,27 @@ extern "C" {
            struct ggml_opt_params params,
            struct ggml_tensor * f);
    // initialize optimizer context
    GGML_API void ggml_opt_init(
            struct ggml_context * ctx,
            struct ggml_opt_context * opt,
            struct ggml_opt_params params,
            int64_t nx);
    // continue optimizing the function defined by the tensor f
    GGML_API enum ggml_opt_result ggml_opt_resume(
            struct ggml_context * ctx,
            struct ggml_opt_context * opt,
            struct ggml_tensor * f);
    // continue optimizing the function defined by the tensor f
    GGML_API enum ggml_opt_result ggml_opt_resume_g(
            struct ggml_context * ctx,
            struct ggml_opt_context * opt,
            struct ggml_tensor * f,
            struct ggml_cgraph * gf,
            struct ggml_cgraph * gb);
    //
    // quantization
    //
--- a/k_quants.c
+++ b/k_quants.c
@ -1519,7 +1519,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
    const uint8x16_t m4b = vdupq_n_u8(0xf);
 #ifdef __ARM_FEATURE_DOTPROD
-    const uint32x4_t mzero = vdupq_n_s32(0);
+    const int32x4_t mzero = vdupq_n_s32(0);
 #endif
    int8x16x2_t q4bytes;
@ -1745,7 +1745,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
    const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const uint32x4_t mzero = vdupq_n_u32(0);
+    const int32x4_t mzero = vdupq_n_s32(0);
    const uint8x16_t mone = vdupq_n_u8(1);
    const uint8x16_t mtwo = vdupq_n_u8(2);
@ -2242,5 +2242,3 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
    *s = sumf;
 #endif
 }
--- a/llama.cpp
+++ b/llama.cpp
@ -165,6 +165,11 @@ struct llama_kv_cache {
        if (ctx) {
            ggml_free(ctx);
        }
 #ifdef GGML_USE_CUBLAS
        ggml_cuda_free_data(k);
        ggml_cuda_free_data(v);
 #endif // GGML_USE_CUBLAS
    }
 };
@ -210,7 +215,12 @@ struct llama_model {
        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
            ggml_cuda_free_data(tensors_by_name[i].second);
        }
-#endif // GGML_USE_CUBLAS
+        ggml_cuda_free_scratch();
 #elif defined(GGML_USE_CLBLAST)
        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
            ggml_cl_free_data(tensors_by_name[i].second);
        }
 #endif
    }
 };
@ -703,6 +713,9 @@ struct llama_model_loader {
    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
        struct ggml_tensor * tensor;
        if (backend != GGML_BACKEND_CPU) {
            ggml_set_no_alloc(ggml_ctx, true);
        }
        if (lt.ne.size() == 2) {
            tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
        } else {
@ -712,6 +725,9 @@ struct llama_model_loader {
        ggml_set_name(tensor, lt.name.c_str());
        LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
        if (backend != GGML_BACKEND_CPU) {
            ggml_set_no_alloc(ggml_ctx, use_mmap);
        }
        tensor->backend = backend;
        lt.ggml_tensor = tensor;
        num_ggml_tensors_created++;
@ -727,6 +743,7 @@ struct llama_model_loader {
    void load_all_data(llama_progress_callback progress_callback, void *  progress_callback_user_data, llama_mlock * lmlock) {
        size_t data_size = 0;
        size_t prefetch_size = 0;
        size_t lock_size = 0;
        for (const llama_load_tensor & lt : tensors_map.tensors) {
            data_size += lt.size;
            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@ -736,11 +753,6 @@ struct llama_model_loader {
        if (use_mmap) {
            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
            if (!lmlock) {
                // Don't call the callback since the actual loading will be lazy
                // and we can't measure it.
                progress_callback = NULL;
            }
            if (lmlock) {
                lmlock->init(mapping->addr);
            }
@ -748,20 +760,49 @@ struct llama_model_loader {
        size_t done_size = 0;
        for (llama_load_tensor & lt : tensors_map.tensors) {
            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
                continue;
            }
            if (progress_callback) {
                progress_callback((float) done_size / data_size, progress_callback_user_data);
            }
            LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
            lt.data = (uint8_t *) lt.ggml_tensor->data;
-            load_data_for(lt);
+
-            lt.ggml_tensor->data = lt.data;
+            // allocate temp buffer if not using mmap
-            done_size += lt.size;
+            if (!use_mmap && lt.data == NULL) {
-            if (use_mmap && lmlock) {
+                GGML_ASSERT(lt.ggml_tensor->backend != GGML_BACKEND_CPU);
-                lmlock->grow_to(done_size);
+                lt.data = (uint8_t*)malloc(ggml_nbytes(lt.ggml_tensor));
            }
            load_data_for(lt);
            switch(lt.ggml_tensor->backend) {
                case GGML_BACKEND_CPU:
                    lt.ggml_tensor->data = lt.data;
                    if (use_mmap && lmlock) {
                        lock_size += lt.size;
                        lmlock->grow_to(lock_size);
                    }
                    break;
 #if defined(GGML_USE_CUBLAS)
                case GGML_BACKEND_GPU:
                case GGML_BACKEND_GPU_SPLIT:
                    ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
                    if (!use_mmap) {
                        free(lt.data);
                    }
                    break;
 #elif defined(GGML_USE_CLBLAST)
                case GGML_BACKEND_GPU:
                    ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
                    if (!use_mmap) {
                        free(lt.data);
                    }
                    break;
 #endif
                default:
                    continue;
            }
            done_size += lt.size;
        }
    }
@ -832,7 +873,8 @@ static bool kv_cache_init(
        const struct llama_hparams & hparams,
             struct llama_kv_cache & cache,
                         ggml_type   wtype,
-                               int   n_ctx) {
+                               int   n_ctx,
                               int   n_gpu_layers) {
    const int n_embd  = hparams.n_embd;
    const int n_layer = hparams.n_layer;
@ -858,6 +900,15 @@ static bool kv_cache_init(
    ggml_set_name(cache.k, "cache_k");
    ggml_set_name(cache.v, "cache_v");
 #ifdef GGML_USE_CUBLAS
    if (n_gpu_layers > n_layer + 1) {
        ggml_cuda_assign_buffers_no_scratch(cache.v);
    }
    if (n_gpu_layers > n_layer + 2) {
        ggml_cuda_assign_buffers_no_scratch(cache.k);
    }
 #endif // GGML_USE_CUBLAS
    return true;
 }
@ -868,6 +919,7 @@ struct llama_context_params llama_context_default_params() {
        /*.gpu_layers                  =*/ 0,
        /*.main_gpu                    =*/ 0,
        /*.tensor_split                =*/ {0},
        /*.low_vram                    =*/ false,
        /*.seed                        =*/ -1,
        /*.f16_kv                      =*/ true,
        /*.logits_all                  =*/ false,
@ -882,6 +934,17 @@ struct llama_context_params llama_context_default_params() {
    return result;
 }
 struct llama_model_quantize_params llama_model_quantize_default_params() {
    struct llama_model_quantize_params result = {
        /*.nthread                     =*/ 0,
        /*.ftype                       =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
        /*.allow_requantize            =*/ false,
        /*.quantize_output_tensor      =*/ true,
    };
    return result;
 }
 bool llama_mmap_supported() {
    return llama_mmap::SUPPORTED;
 }
@ -965,6 +1028,7 @@ static void llama_model_load_internal(
        int n_gpu_layers,
        int main_gpu,
        const float * tensor_split,
        bool low_vram,
        ggml_type memory_type,
        bool use_mmap,
        bool use_mlock,
@ -990,6 +1054,12 @@ static void llama_model_load_internal(
            case 40: model.type = e_model::MODEL_13B; break;
            case 60: model.type = e_model::MODEL_30B; break;
            case 80: model.type = e_model::MODEL_65B; break;
            default:
                {
                    if (hparams.n_layer < 32) {
                        model.type = e_model::MODEL_7B;
                    }
                } break;
        }
        hparams.n_ctx = n_ctx;
@ -1085,18 +1155,34 @@ static void llama_model_load_internal(
        ml->ggml_ctx = ctx;
        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
        model.norm           = ml->get_tensor("norm.weight",           {n_embd},          GGML_BACKEND_CPU);
        // "output" tensor
        {
            ggml_backend backend_norm;
            ggml_backend backend_output;
            if (n_gpu_layers > int(n_layer)) { // NOLINT
                // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
                backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
 #else
                backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
 #endif // _WIN32
                backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
            } else {
                backend_norm = GGML_BACKEND_CPU;
                backend_output = GGML_BACKEND_CPU;
            }
            model.norm   = ml->get_tensor("norm.weight",   {n_embd},          backend_norm);
            model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
            if (backend_norm == GGML_BACKEND_GPU) {
                vram_weights += ggml_nbytes(model.norm);
            }
            if (backend_output == GGML_BACKEND_GPU_SPLIT) {
                vram_weights += ggml_nbytes(model.output);
            }
        }
        const int i_gpu_start = n_layer - n_gpu_layers;
@ -1126,7 +1212,7 @@ static void llama_model_load_internal(
            if (backend == GGML_BACKEND_GPU) {
                vram_weights +=
                    ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk)             +
-                    ggml_nbytes(layer.wv)             + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+                    ggml_nbytes(layer.wv)             + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
                    ggml_nbytes(layer.w1)             + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
            }
        }
@ -1154,23 +1240,49 @@ static void llama_model_load_internal(
                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
        (void) vram_scratch;
        (void) n_batch;
 #ifdef GGML_USE_CUBLAS
-        vram_scratch = n_batch * MB;
+        if (low_vram) {
-        ggml_cuda_set_scratch_size(vram_scratch);
+            fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
-        if (n_gpu_layers > 0) {
+            ggml_cuda_set_scratch_size(0); // disable scratch
-            fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
+        } else {
-                    __func__, vram_scratch / MB);
+            vram_scratch = n_batch * MB;
            ggml_cuda_set_scratch_size(vram_scratch);
            if (n_gpu_layers > 0) {
                fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
                        __func__, vram_scratch / MB);
            }
        }
 #endif // GGML_USE_CUBLAS
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-        fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
+        fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
        if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
+            fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
        }
        size_t vram_kv_cache = 0;
        if (n_gpu_layers > (int) hparams.n_layer + 1) {
            if (low_vram) {
                fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
            } else {
                fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
            }
        }
        if (n_gpu_layers > (int) hparams.n_layer + 2) {
            if (low_vram) {
                fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
            } else {
                fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
            }
        }
        const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
        fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
                __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
        fprintf(stderr, "%s: total VRAM used: %zu MB\n",
-                __func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up
+                __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
 #else
        (void) n_gpu_layers;
 #endif
@ -1181,58 +1293,15 @@ static void llama_model_load_internal(
        model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
    }
-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+    (void) tensor_split;
 #if defined(GGML_USE_CUBLAS)
    {
        ggml_cuda_set_tensor_split(tensor_split);
        size_t done_size = 0;
        size_t data_size = 0;
        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
            data_size += lt.size;
            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
                done_size += lt.size;
            }
        }
        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
            ggml_backend backend = lt.ggml_tensor->backend;
            if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
                continue;
            }
            if (progress_callback) {
                progress_callback((float) done_size / data_size, progress_callback_user_data);
            }
            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
            done_size += lt.size;
        }
    }
 #elif defined(GGML_USE_CLBLAST)
    {
        size_t done_size = 0;
        size_t data_size = 0;
        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
            data_size += lt.size;
            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
                done_size += lt.size;
            }
        }
        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
            if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
                continue;
            }
            if (progress_callback) {
                progress_callback((float) done_size / data_size, progress_callback_user_data);
            }
            ggml_cl_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
            done_size += lt.size;
        }
    }
 #else
    (void) n_batch;
    (void) tensor_split;
 #endif
    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
    if (progress_callback) {
        progress_callback(1.0f, progress_callback_user_data);
    }
@ -1252,6 +1321,7 @@ static bool llama_model_load(
        int n_gpu_layers,
        int main_gpu,
        float * tensor_split,
        bool low_vram,
        ggml_type memory_type,
        bool use_mmap,
        bool use_mlock,
@ -1259,7 +1329,7 @@ static bool llama_model_load(
        llama_progress_callback progress_callback,
        void *progress_callback_user_data) {
    try {
-        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type,
+        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
                                  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
        return true;
    } catch (const std::exception & err) {
@ -1335,12 +1405,33 @@ static bool llama_eval_internal(
    const int i_gpu_start = n_layer - n_gpu_layers;
    (void) i_gpu_start;
    // offload functions set the tensor output backend to GPU
    // tensors are GPU-accelerated if any input or the output has been offloaded
    //
    // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
    // in that case ggml_cuda_assign_buffers has no effect
    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
    offload_func_t offload_func_kq = llama_nop;
    offload_func_t offload_func_v  = llama_nop;
 #ifdef GGML_USE_CUBLAS
        if (n_gpu_layers > n_layer) {
            offload_func_nr = ggml_cuda_assign_buffers;
        }
        if (n_gpu_layers > n_layer + 1) {
            offload_func_v  = ggml_cuda_assign_buffers;
        }
        if (n_gpu_layers > n_layer + 2) {
            offload_func_kq = ggml_cuda_assign_buffers;
        }
 #endif // GGML_USE_CUBLAS
    for (int il = 0; il < n_layer; ++il) {
        offload_func_t offload_func = llama_nop;
 #ifdef GGML_USE_CUBLAS
        if (il >= i_gpu_start) {
-            offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+            offload_func = ggml_cuda_assign_buffers;
        }
 #endif // GGML_USE_CUBLAS
@ -1363,31 +1454,42 @@ static bool llama_eval_internal(
        // self-attention
        {
            // compute Q and K and RoPE them
            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
            // offload_func(tmpq);
            ggml_set_name(tmpq, "tmpq");
            struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-            // offload_func(tmpk);
+            offload_func_kq(tmpk);
            ggml_set_name(tmpk, "tmpk");
            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
            offload_func_kq(tmpq);
            ggml_set_name(tmpq, "tmpq");
            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
            offload_func_kq(Kcur);
            ggml_set_name(Kcur, "Kcur");
            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
            offload_func_kq(Qcur);
            ggml_set_name(Qcur, "Qcur");
            // store key and value to memory
            {
                // compute the transposed [N, n_embd] V matrix
-                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
+
                struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                offload_func_v(tmpv);
                ggml_set_name(tmpv, "tmpv");
                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N));
                offload_func_v(Vcur);
                ggml_set_name(Vcur, "Vcur");
                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
                offload_func_kq(k);
                ggml_set_name(k, "k");
                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                        (   n_ctx)*ggml_element_size(kv_self.v),
                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
                offload_func_v(v);
                ggml_set_name(v, "v");
                // important: storing RoPE-ed version of K in the KV cache!
@ -1399,6 +1501,7 @@ static bool llama_eval_internal(
                ggml_permute(ctx0,
                        Qcur,
                        0, 2, 1, 3);
            offload_func_kq(Q);
            ggml_set_name(Q, "Q");
            struct ggml_tensor * K =
@ -1407,10 +1510,12 @@ static bool llama_eval_internal(
                            ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
                            n_embd/n_head, n_head, n_past + N),
                        0, 2, 1, 3);
            offload_func_kq(K);
            ggml_set_name(K, "K");
            // K * Q
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
            offload_func_kq(KQ);
            ggml_set_name(KQ, "KQ");
            // KQ_scaled = KQ / sqrt(n_embd/n_head)
@ -1419,14 +1524,17 @@ static bool llama_eval_internal(
            // KQ_scaled shape [n_past + N, N, n_head, 1]
            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
            offload_func_kq(KQ_scaled);
            ggml_set_name(KQ_scaled, "KQ_scaled");
            // KQ_masked = mask_past(KQ_scaled)
            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
            offload_func_kq(KQ_masked);
            ggml_set_name(KQ_masked, "KQ_masked");
            // KQ = soft_max(KQ_masked)
            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
            offload_func_v(KQ_soft_max);
            ggml_set_name(KQ_soft_max, "KQ_soft_max");
            // split cached V into n_head heads
@ -1436,10 +1544,12 @@ static bool llama_eval_internal(
                        n_ctx*ggml_element_size(kv_self.v),
                        n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
                        il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
            offload_func_v(V);
            ggml_set_name(V, "V");
 #if 1
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
            offload_func_v(KQV);
            ggml_set_name(KQV, "KQV");
 #else
            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
@ -1451,12 +1561,14 @@ static bool llama_eval_internal(
            // KQV_merged = KQV.permute(0, 2, 1, 3)
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
            offload_func_v(KQV_merged);
            ggml_set_name(KQV_merged, "KQV_merged");
            // cur = KQV_merged.contiguous().view(n_embd, N)
            cur = ggml_cpy(ctx0,
                    KQV_merged,
                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
            offload_func_v(cur);
            ggml_set_name(cur, "KQV_merged_contiguous");
            // projection (no bias)
@ -1468,7 +1580,6 @@ static bool llama_eval_internal(
        }
        lctx.use_buf(ctx0, 1);
        //ggml_cuda_set_scratch(1);
        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
        offload_func(inpFF);
@ -1526,32 +1637,24 @@ static bool llama_eval_internal(
    }
    lctx.use_buf(ctx0, 0);
    //ggml_cuda_set_scratch(0);
    // used at the end to optionally extract the embeddings
    struct ggml_tensor * embeddings = NULL;
    offload_func_t offload_func = llama_nop;
 #ifdef GGML_USE_CUBLAS
        if (n_gpu_layers > n_layer) {
            offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
        }
 #endif // GGML_USE_CUBLAS
    // norm
    {
        cur = ggml_rms_norm(ctx0, inpL);
-        offload_func(cur);
+        offload_func_nr(cur);
        ggml_set_name(cur, "rms_norm_inpL");
        cur = ggml_rms_norm(ctx0, cur);
-        offload_func(cur);
+        offload_func_nr(cur);
        ggml_set_name(cur, "rms_norm_after");
        // cur = cur*norm(broadcasted)
        cur = ggml_mul(ctx0, cur, model.norm);
-        offload_func(cur);
+        offload_func_nr(cur);
        ggml_set_name(cur, "result_norm");
        embeddings = cur;
@ -2159,6 +2262,10 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
        return -log2f(candidate.p) > *mu;
    }));
    if (candidates->size == 0) {
        candidates->size = 1;
    }
    // Normalize the probabilities of the remaining words
    llama_sample_softmax(ctx, candidates);
@ -2227,15 +2334,79 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
 // quantization
 //
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
+static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llama_buffer & output, const int nelements, const int nthread) {
    if (output.size < nelements * sizeof(float)) {
        output.resize(nelements * sizeof(float));
    }
    float * f32_output = (float *) output.addr;
    quantize_fns_t qtype;
    if (ggml_is_quantized(tensor.type)) {
        qtype = ggml_internal_get_quantize_fn(tensor.type);
        if (qtype.dequantize_row_q == NULL) {
            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
        }
    } else if (tensor.type != GGML_TYPE_F16) {
        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
    }
    if (nthread < 2) {
        if (tensor.type == GGML_TYPE_F16) {
            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
        } else if (ggml_is_quantized(tensor.type)) {
            qtype.dequantize_row_q(tensor.data, f32_output, nelements);
        } else {
            LLAMA_ASSERT(false); // unreachable
        }
        return;
    }
    auto block_size = tensor.type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor.type);
    auto block_size_bytes = ggml_type_size(tensor.type);
    LLAMA_ASSERT(nelements % block_size == 0);
    auto nblocks = nelements / block_size;
    auto blocks_per_thread = nblocks / nthread;
    auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
    std::vector<std::thread> workers;
    for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
        auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
        auto thr_elems = thr_blocks * block_size; // number of elements for this thread
        auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
        auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
            if (typ == GGML_TYPE_F16) {
                ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
            } else {
                qtype.dequantize_row_q(inbuf, outbuf, nels);
            }
        };
        workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
        in_buff_offs += thr_block_bytes;
        out_buff_offs += thr_elems;
    }
    for (auto & worker : workers) {
        worker.join();
    }
 }
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
    ggml_type quantized_type;
-    switch (ftype) {
+    llama_ftype ftype = params->ftype;
    int nthread = params->nthread;
    switch (params->ftype) {
        case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
        case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
        case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
        case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
        case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
        case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
        case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
 #ifdef GGML_USE_K_QUANTS
        // K-quants
        case LLAMA_FTYPE_MOSTLY_Q2_K:   quantized_type = GGML_TYPE_Q2_K; break;
        case LLAMA_FTYPE_MOSTLY_Q3_K_S:
@ -2246,6 +2417,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        case LLAMA_FTYPE_MOSTLY_Q5_K_S:
        case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
        case LLAMA_FTYPE_MOSTLY_Q6_K:   quantized_type = GGML_TYPE_Q6_K; break;
 #endif
        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
    }
@ -2255,8 +2427,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
                                                                            /*vocab_only*/ false));
-    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
+    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
 #ifdef GGML_USE_K_QUANTS
    int n_attention_wv    = 0;
    int n_feed_forward_w2 = 0;
    for (auto& tensor : model_loader->tensors_map.tensors) {
@ -2270,6 +2443,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    int i_attention_wv = 0;
    int i_feed_forward_w2 = 0;
 #endif
    size_t total_size_org = 0;
    size_t total_size_new = 0;
@ -2295,11 +2469,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        // quantize only 2D tensors
        quantize &= (tensor.ne.size() == 2);
-
+        quantize &= params->quantize_output_tensor || tensor.name != "output.weight";
-        // uncomment this to keep the output layer in FP16
+        quantize &= quantized_type != tensor.type;
        //if (tensor.name == "output.weight") {
        //    quantize = false;
        //}
        enum ggml_type new_type;
        void * new_data;
@ -2313,46 +2484,40 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
        } else {
            new_type = quantized_type;
-            // TODO: temporary disabled until Metal / OpenCL support is available
+#ifdef GGML_USE_K_QUANTS
-            //       ref: https://github.com/ggerganov/llama.cpp/issues/1711
+            if (tensor.name == "output.weight") {
-            //if (tensor.name == "output.weight") {
+               new_type = GGML_TYPE_Q6_K;
-            //    new_type = GGML_TYPE_Q6_K;
+            } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
            //}
            if (tensor.name.find("attention.wv.weight") != std::string::npos) {
                if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                         (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
                         (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                ++i_attention_wv;
-            }
+            } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
            if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
                if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                         (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
                         (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                ++i_feed_forward_w2;
-            }
+            } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
            if (tensor.name.find("attention.wo.weight") != std::string::npos) {
                if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
            }
 #endif
            float * f32_data;
            size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
            llama_buffer f32_conv_buf;
            if (tensor.type == GGML_TYPE_F32) {
                f32_data = (float *) tensor.data;
-            } else if (tensor.type == GGML_TYPE_F16) {
+            } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
-                f32_conv_buf.resize(nelements * sizeof(float));
+                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
                f32_data = (float *) f32_conv_buf.addr;
                const auto * f16_data = (const ggml_fp16_t *) tensor.data;
                for (size_t i = 0; i < nelements; i++) {
                    f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
                }
            } else {
-                throw std::runtime_error(format("type %s unsupported for integer quantization", ggml_type_name(tensor.type)));
+                llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
                f32_data = (float *) f32_conv_buf.addr;
            }
            printf("quantizing .. ");
@ -2480,8 +2645,8 @@ struct llama_context * llama_init_from_file(
    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers,
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
-                params.main_gpu, params.tensor_split, memory_type, params.use_mmap, params.use_mlock,
+                params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
        fprintf(stderr, "%s: failed to load model\n", __func__);
        llama_free(ctx);
@ -2490,7 +2655,7 @@ struct llama_context * llama_init_from_file(
    // reserve memory for context buffers
    if (!params.vocab_only) {
-        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
+        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
            fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
            llama_free(ctx);
            return nullptr;
@ -2562,10 +2727,9 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
        const char * fname_inp,
        const char * fname_out,
-  enum llama_ftype   ftype,
+        const llama_model_quantize_params *params) {
        int          nthread) {
    try {
-        llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
+        llama_model_quantize_internal(fname_inp, fname_out, params);
        return 0;
    } catch (const std::exception & err) {
        fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what());
@ -3228,6 +3392,19 @@ int llama_n_embd(const struct llama_context * ctx) {
    return ctx->model.hparams.n_embd;
 }
 int llama_get_vocab(
        const struct llama_context * ctx,
        const char * * strings,
        float  * scores,
        int capacity) {
    int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
    for (int i = 0; i<n; ++i) {
        strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
        scores[i]  = ctx->vocab.id_to_token[i].score;
    }
    return n;
 }
 float * llama_get_logits(struct llama_context * ctx) {
    return ctx->logits.data();
 }
--- a/llama.h
+++ b/llama.h
@ -77,6 +77,7 @@ extern "C" {
        int n_gpu_layers;                      // number of layers to store in VRAM
        int main_gpu;                          // the GPU that is used for scratch and small tensors
        float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
        bool low_vram;                         // if true, reduce VRAM usage at the cost of performance
        int seed;                              // RNG seed, -1 for random
        bool f16_kv;     // use fp16 for KV cache
@ -115,7 +116,16 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_Q6_K          = 18,// except 1d tensors
    };
    // model quantization parameters
    typedef struct llama_model_quantize_params {
        int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
        enum llama_ftype   ftype;    // quantize to this llama_ftype
        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
        bool quantize_output_tensor; // quantize output.weight
    } llama_model_quantize_params;
    LLAMA_API struct llama_context_params llama_context_default_params();
    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
    LLAMA_API bool llama_mmap_supported();
    LLAMA_API bool llama_mlock_supported();
@ -137,14 +147,11 @@ extern "C" {
    // Frees all allocated memory
    LLAMA_API void llama_free(struct llama_context * ctx);
    // TODO: not great API - very likely to change
    // Returns 0 on success
    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
    LLAMA_API int llama_model_quantize(
            const char * fname_inp,
            const char * fname_out,
-      enum llama_ftype   ftype,
+            const llama_model_quantize_params * params);
            int          nthread);
    // Apply a LoRA adapter to a loaded model
    // path_base_model is the path to a higher quality model to use as a base for
@ -214,6 +221,14 @@ extern "C" {
    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
    LLAMA_API int llama_n_embd (const struct llama_context * ctx);
    // Get the vocabulary as output parameters.
    // Returns number of results.
    LLAMA_API int llama_get_vocab(
            const struct llama_context * ctx,
                          const char * * strings,
                                 float * scores,
                                   int   capacity);
    // Token logits obtained from the last call to llama_eval()
    // The logits for the last token are stored in the last row
    // Can be mutated in order to change the probabilities of the next token
--- a/tests/test-grad0.c
+++ b/tests/test-grad0.c
@ -5,7 +5,7 @@
 #include <stdlib.h>
 #include <assert.h>
-#define MAX_NARGS 2
+#define MAX_NARGS 3
 #undef MIN
 #undef MAX
@ -1090,6 +1090,25 @@ int main(int argc, const char ** argv) {
            }
        }
        // cross_entropy_loss
        {
            const int nargs = 1;
            int64_t ne2[4];
            get_random_dims(ne2, 4);
            for (int ndims = 1; ndims <= 3; ++ndims) {
                x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
                x[1] = get_random_tensor(ctx0, ndims, ne2, 0.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cross_entropy_loss(ctx0, x[0], x[1]));
                check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-1f, 1e-2f, INFINITY);
                // finite differences regularly fails!
            }
        }
        // rope
        {
            const int nargs = 1;
@ -1124,6 +1143,45 @@ int main(int argc, const char ** argv) {
            }
        }
        // flash_attn
        {
            const int nargs = 3;
            int64_t ne2[4];
            get_random_dims(ne2, 4);
            int64_t D = ne2[0];
            int64_t N = ne2[1];
            int64_t M = ne2[2] + N;
            int64_t B = ne2[3];
            for (int masked = 0; masked <= 1; ++masked) {
                for (int ndims = 2; ndims <= 4; ++ndims) {
                    int64_t neq[4] = { D, N, B, ne[3] };
                    int64_t nek[4] = { D, M, B, ne[3] };
                    int64_t nev[4] = { M, D, B, ne[3] };
                    if (ndims == 2) {
                        neq[2] = 1; neq[3] = 1;
                        nek[2] = 1; nek[3] = 1;
                        nev[2] = 1; nev[3] = 1;
                    } else if (ndims == 3) {
                        neq[3] = 1;
                        nek[3] = 1;
                        nev[3] = 1;
                    }
                    x[0] = get_random_tensor(ctx0, ndims, neq, -0.1250f, 0.1250f);
                    x[1] = get_random_tensor(ctx0, ndims, nek, -0.1250f, 0.1250f);
                    x[2] = get_random_tensor(ctx0, ndims, nev, -0.1250f, 0.1250f);
                    ggml_set_param(ctx0, x[0]);
                    ggml_set_param(ctx0, x[1]);
                    ggml_set_param(ctx0, x[2]);
                    struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
                    check_gradient("flash_attn", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
                }
            }
        }
        ggml_free(ctx0);
    }