From de280085e7917dbb7f5753de5842ff4455f82a81 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Tue, 23 Jul 2024 23:48:37 +0200
Subject: [PATCH 01/35] examples : Fix `llama-export-lora` example (#8607)

* fix export-lora example

* add more logging

* reject merging subset

* better check

* typo
---
 Makefile                             |   2 +-
 common/common.cpp                    |  19 +-
 common/common.h                      |   3 +-
 examples/export-lora/README.md       |  15 +-
 examples/export-lora/export-lora.cpp | 775 +++++++++++++--------------
 5 files changed, 378 insertions(+), 436 deletions(-)

diff --git a/Makefile b/Makefile
index 52b55dd89..58a93db1a 100644
--- a/Makefile
+++ b/Makefile
@@ -1322,7 +1322,7 @@ llama-finetune: examples/finetune/finetune.cpp \
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
 llama-export-lora: examples/export-lora/export-lora.cpp \
-	$(OBJ_GGML) common/log.h
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
diff --git a/common/common.cpp b/common/common.cpp
index dbb724fbb..4c19132f1 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -694,11 +694,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
         return true;
     }
-    if (arg == "--lora-base") {
-        CHECK_ARG
-        params.lora_base = argv[i];
-        return true;
-    }
     if (arg == "--control-vector") {
         CHECK_ARG
         params.control_vectors.push_back({ 1.0f, argv[i], });
@@ -1274,6 +1269,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         CHECK_ARG
         params.out_file = argv[i];
         params.cvector_outfile = argv[i];
+        params.lora_outfile = argv[i];
         return true;
     }
     if (arg == "-ofreq" || arg == "--output-frequency") {
@@ -1583,9 +1579,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*",           "       --override-kv KEY=TYPE:VALUE",
                                                                         "advanced option to override model metadata by key. may be specified multiple times.\n"
                                                                         "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
-    options.push_back({ "*",           "       --lora FNAME",           "apply LoRA adapter (implies --no-mmap)" });
-    options.push_back({ "*",           "       --lora-scaled FNAME S",  "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
-    options.push_back({ "*",           "       --lora-base FNAME",      "optional model to use as a base for the layers modified by the LoRA adapter" });
+    options.push_back({ "*",           "       --lora FNAME",           "apply LoRA adapter (can be repeated to use multiple adapters)" });
+    options.push_back({ "*",           "       --lora-scaled FNAME S",  "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
     options.push_back({ "*",           "       --control-vector FNAME", "add a control vector\n"
                                                                         "note: this argument can be repeated to add multiple control vectors" });
     options.push_back({ "*",           "       --control-vector-scaled FNAME SCALE",
@@ -1676,6 +1671,13 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "cvector",     "       --pca-iter N",           "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
     options.push_back({ "cvector",     "       --method {pca,mean}",    "dimensionality reduction method to be used (default: pca)" });
 
+    options.push_back({ "export-lora" });
+    options.push_back({ "export-lora", "-m,    --model",                "model path from which to load base model (default '%s')", params.model.c_str() });
+    options.push_back({ "export-lora", "       --lora FNAME",           "path to LoRA adapter  (can be repeated to use multiple adapters)" });
+    options.push_back({ "export-lora", "       --lora-scaled FNAME S",  "path to LoRA adapter with user defined scaling S  (can be repeated to use multiple adapters)" });
+    options.push_back({ "*",           "-t,    --threads N",            "number of threads to use during computation (default: %d)", params.n_threads });
+    options.push_back({ "export-lora", "-o,    --output FNAME",         "output file (default: '%s')", params.lora_outfile.c_str() });
+
     printf("usage: %s [options]\n", argv[0]);
 
     for (const auto & o : options) {
@@ -3166,7 +3168,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
         }
         fprintf(stream, "  - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
     }
-    fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
diff --git a/common/common.h b/common/common.h
index 184a53dc0..8240ff99b 100644
--- a/common/common.h
+++ b/common/common.h
@@ -128,7 +128,6 @@ struct gpt_params {
 
     // TODO: avoid tuple, use struct
     std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
-    std::string lora_base  = "";                              // base model path for the lora adapter
 
     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
 
@@ -255,6 +254,8 @@ struct gpt_params {
     std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
 
     bool spm_infill = false; // suffix/prefix/middle pattern for infill
+
+    std::string lora_outfile = "ggml-lora-merged-f16.gguf";
 };
 
 void gpt_params_handle_hf_token(gpt_params & params);
diff --git a/examples/export-lora/README.md b/examples/export-lora/README.md
index 1fb17feec..6d51f4b24 100644
--- a/examples/export-lora/README.md
+++ b/examples/export-lora/README.md
@@ -6,12 +6,11 @@ Apply LORA adapters to base model and export the resulting model.
 usage: llama-export-lora [options]
 
 options:
-  -h, --help                         show this help message and exit
-  -m FNAME, --model-base FNAME       model path from which to load base model (default '')
-  -o FNAME, --model-out FNAME        path to save exported model (default '')
-  -l FNAME, --lora FNAME             apply LoRA adapter
-  -s FNAME S, --lora-scaled FNAME S  apply LoRA adapter with user defined scaling S
-  -t N, --threads N                  number of threads to use during computation (default: 4)
+  -m,    --model                  model path from which to load base model (default '')
+         --lora FNAME             path to LoRA adapter  (can be repeated to use multiple adapters)
+         --lora-scaled FNAME S    path to LoRA adapter with user defined scaling S  (can be repeated to use multiple adapters)
+  -t,    --threads N              number of threads to use during computation (default: 4)
+  -o,    --output FNAME           output file (default: 'ggml-lora-merged-f16.gguf')
 ```
 
 For example:
@@ -20,7 +19,7 @@ For example:
 ./bin/llama-export-lora \
     -m open-llama-3b-v2-q8_0.gguf \
     -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
-    -l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
+    --lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
 ```
 
-Multiple LORA adapters can be applied by passing multiple `-l FN` or `-s FN S` command line parameters.
+Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters.
diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp
index 16f27aa77..124ee167d 100644
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -1,465 +1,406 @@
-
 #include "common.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
 
+#include <map>
 #include <vector>
 #include <string>
 #include <thread>
+#include <fstream>
 
-struct lora_info {
-    std::string filename;
+static bool g_verbose = false;
+
+static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){
+    const int key_idx = gguf_find_key(ctx_gguf, key.c_str()); // a negative index is treated as "key not present"
+    return key_idx >= 0 ? std::string(gguf_get_val_str(ctx_gguf, key_idx)) : std::string(); // "" when not present
+}
+
+static float get_kv_f32(struct gguf_context * ctx_gguf, const std::string & key) {
+    const int key_idx = gguf_find_key(ctx_gguf, key.c_str()); // a negative index is treated as "key not present"
+    return key_idx >= 0 ? gguf_get_val_f32(ctx_gguf, key_idx) : 0.0f; // 0.0f doubles as the "not present" result
+}
+
+static void zeros(std::ofstream & file, size_t n) {
+    if (n > 0) { // writes n zero bytes to file with one bulk write() instead of n single-byte writes
+        const std::vector<char> zero(n, 0);
+        file.write(zero.data(), (std::streamsize) zero.size());
+    }
+}
+
+static std::string ggml_ne_string(const ggml_tensor * t) {
+    // joins all GGML_MAX_DIMS entries of t->ne with ", " (no trailing separator)
+    std::string dims;
+    for (int d = 0; d < GGML_MAX_DIMS; ++d) {
+        // the separator goes in front of every entry except the first
+        dims += d == 0 ? "" : ", ";
+        dims += std::to_string(t->ne[d]);
+    }
+    return dims;
+}
+
+static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) {
+    // loads fname with no_alloc set and ctx_ggml forwarded to the loader; throws std::runtime_error on a null result
+    struct gguf_init_params init_params;
+    init_params.no_alloc = true;
+    init_params.ctx      = ctx_ggml;
+    struct gguf_context * loaded = gguf_init_from_file(fname.c_str(), init_params);
+    if (loaded == nullptr) {
+        throw std::runtime_error("failed to load input GGUF from " + fname);
+    }
+    return loaded;
+}
+
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return; // an empty pattern matches at every position and the scan below would never advance
+    }
+    std::string result; // s is rebuilt here: every non-overlapping match, left to right, becomes `replace`
+    size_t pos = 0;
+    for (size_t hit; (hit = s.find(search, pos)) != std::string::npos; pos = hit + search.length()) {
+        result.append(s, pos, hit - pos); // text between the previous match and this one
+        result += replace;
+    }
+    result.append(s, pos, std::string::npos); // tail after the last match
+    s = std::move(result);
+}
+
+struct file_input { // one input GGUF file: its parsed metadata plus a stream for reading tensor bytes on demand
+    struct ggml_context * ctx_meta = nullptr; // filled in through load_gguf; source of the entries in `tensors`
+    struct gguf_context * ctx_gguf = nullptr;
+    std::ifstream f_in; // second handle on the same file, used by read_tensor_data
+    std::map<std::string, ggml_tensor *> tensors; // tensor name -> tensor found in ctx_meta
+    float alpha; // value of "adapter.lora.alpha" (0.0f when the key is missing, see get_kv_f32 call below)
     float scale;
+
+    file_input(std::string & fname, float scale): f_in(fname, std::ios::binary), scale(scale) {
+        if (!f_in.is_open()) {
+            throw std::runtime_error("failed to open input gguf from " + fname);
+        }
+
+        ctx_gguf = load_gguf(fname, &ctx_meta);
+        alpha = get_kv_f32(ctx_gguf, "adapter.lora.alpha");
+        printf("%s: loaded gguf from %s\n", __func__, fname.c_str());
+
+        for (ggml_tensor * cur = ggml_get_first_tensor(ctx_meta); cur; cur = ggml_get_next_tensor(ctx_meta, cur)) {
+            tensors[cur->name] = cur; // index every tensor by its name
+            if (g_verbose) {
+                printf("%s: %s\n", __func__, cur->name);
+            }
+        }
+    }
+
+    ggml_tensor * get_tensor(const std::string & name) { // nullptr when no tensor has this name
+        const auto it = tensors.find(name);
+        return it == tensors.end() ? nullptr : it->second;
+    }
+
+    void read_tensor_data(const std::string & name, std::vector<uint8_t> & buf) { // buf is grown if needed, never shrunk
+        const auto it = tensors.find(name);
+        if (it == tensors.end()) {
+            throw std::runtime_error("cannot find tensor with name: " + name);
+        }
+        const size_t len = ggml_nbytes(it->second);
+        if (buf.size() < len) {
+            buf.resize(len);
+        }
+        auto i_tensor_in = gguf_find_tensor(ctx_gguf, name.c_str()); // idx of tensor in the input file
+        auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in);
+        f_in.seekg(offset);
+        if (!f_in.read((char* )buf.data(), len)) { // a short or failed read must not go unnoticed
+            throw std::runtime_error("failed to read data of tensor: " + name);
+        }
+    }
+
+    ~file_input() {
+        gguf_free(ctx_gguf);
+        ggml_free(ctx_meta);
+    }
 };
 
-struct export_lora_params {
-    std::string fn_model_base;
-    std::string fn_model_out;
-    std::vector<struct lora_info> lora;
+struct lora_merge_ctx { // state for one merge run: inputs, compute backend and output file
+    // input base model + adapters
+    file_input base_model;
+    std::vector<std::unique_ptr<file_input>> adapters;
+
+    // for computing merged tensor
     int n_threads;
-};
+    ggml_backend_t backend = nullptr;
+    ggml_gallocr_t allocr = nullptr;
+    std::vector<uint8_t> read_buf; // scratch space reused for every tensor read and for the merged result
 
-struct lora_data {
-    struct lora_info     info;
-    std::vector<uint8_t> data;
-    struct ggml_context * ctx;
+    // output file
+    struct gguf_context * ctx_out = nullptr;
+    struct ggml_context * ctx_out_ggml = nullptr;
+    std::ofstream fout;
 
-    uint32_t lora_r;
-    uint32_t lora_alpha;
-};
+    lora_merge_ctx(
+            std::string & base_fname,
+            std::vector<std::tuple<std::string, float>> & lora_files,
+            std::string & outfile,
+            int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
+        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
 
-struct llama_file {
-    // use FILE * so we don't have to re-open the file to mmap
-    FILE * fp;
-    size_t size;
+        if (gguf_find_key(base_model.ctx_gguf, LLM_KV_SPLIT_COUNT) >= 0) {
+            throw std::runtime_error("split model is not yet supported");
+        }
 
-    llama_file(const char * fname, const char * mode) {
-        fp = std::fopen(fname, mode);
-        if (fp == NULL) {
-            size = 0;
+        for (const auto & lora_inp : lora_files) { // by reference: no copy of each (path, scale) tuple
+            auto fname = std::get<0>(lora_inp); // copied on purpose: file_input takes a non-const std::string &
+            auto scale = std::get<1>(lora_inp);
+            std::unique_ptr<file_input> adapter(new file_input(fname, scale));
+            check_metadata_lora(adapter.get());
+            adapters.push_back(std::move(adapter));
+        }
+
+        ctx_out = gguf_init_empty();
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead(),
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
+        };
+        ctx_out_ggml = ggml_init(params);
+        backend = ggml_backend_cpu_init();
+        allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+    }
+
+    void check_metadata_lora(file_input * adapter) { // throws unless the file is a LoRA adapter for the base model's arch
+        auto general_type = get_kv_str(adapter->ctx_gguf, "general.type");
+        if (general_type != "adapter") {
+            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
+        }
+
+        auto adapter_type = get_kv_str(adapter->ctx_gguf, "adapter.type");
+        if (adapter_type != "lora") {
+            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
+        }
+
+        auto general_arch_base = get_kv_str(base_model.ctx_gguf, "general.architecture");
+        auto general_arch_lora = get_kv_str(adapter->ctx_gguf,   "general.architecture");
+        if (general_arch_base != general_arch_lora) {
+            throw std::runtime_error("model arch and LoRA arch mismatch");
+        }
+    }
+
+    ggml_type get_out_tensor_type(struct ggml_tensor * t) { // F32 stays F32, every other type is written as F16
+        if (t->type == GGML_TYPE_F32) {
+            return GGML_TYPE_F32;
         } else {
-            seek(0, SEEK_END);
-            size = tell();
-            seek(0, SEEK_SET);
+            return GGML_TYPE_F16;
         }
     }
 
-    size_t tell() const {
-#ifdef _WIN32
-        __int64 ret = _ftelli64(fp);
-#else
-        long ret = std::ftell(fp);
-#endif
-        GGML_ASSERT(ret != -1); // this really shouldn't fail
-        return (size_t) ret;
-    }
+    void run_merge() { // writes the output file: zeroed metadata placeholder, tensor data, then the real metadata
+        // prepare metadata
+        gguf_set_kv(ctx_out, base_model.ctx_gguf);
+        // output is forced to f16 for now
+        gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16);
 
-    void seek(size_t offset, int whence) {
-#ifdef _WIN32
-        int ret = _fseeki64(fp, (__int64) offset, whence);
-#else
-        int ret = std::fseek(fp, (long) offset, whence);
-#endif
-        GGML_ASSERT(ret == 0); // same
-    }
-
-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
-            return;
+        // check if all lora adapters have the same tensors
+        // TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggerganov/llama.cpp/pull/8607#discussion_r1686027777
+        static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. Please merge the adapter one-by-one instead of merging all at once.";
+        if (adapters.size() > 1) {
+            for (size_t i = 1; i < adapters.size(); ++i) {
+                if (adapters[0]->tensors.size() != adapters[i]->tensors.size()) {
+                    throw std::runtime_error(err_no_subset_adapter);
+                }
+                for (auto & it : adapters[i]->tensors) {
+                    if (adapters[0]->get_tensor(it.first) == nullptr) {
+                        throw std::runtime_error(err_no_subset_adapter);
+                    }
+                }
+            }
         }
-        errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
-        if (ferror(fp)) {
-            die_fmt("read error: %s", strerror(errno));
+
+        // if true, this tensor can be lora-merged. if false, we skip merging and just copy data to outfile
+        std::vector<std::pair<struct ggml_tensor *, bool>> base_tensors;
+        for (auto & it : base_model.tensors) {
+            bool t_a = true;
+            bool t_b = true;
+            for (auto & adapter : adapters) {
+                t_a &= nullptr != adapter->get_tensor(it.first + ".lora_a");
+                t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b");
+            }
+            auto base_tensor = it.second;
+            struct ggml_tensor * out_tensor;
+            if (!t_a && !t_b) {
+                // only copy
+                out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
+                ggml_set_name(out_tensor, base_tensor->name);
+                base_tensors.push_back(std::make_pair(out_tensor, false));
+            } else if (t_a && t_b) {
+                // need merging
+                out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
+                out_tensor->type = get_out_tensor_type(base_tensor); // NOTE(review): only `type` is overridden; confirm sizes/strides stay valid for quantized base tensors
+                ggml_set_name(out_tensor, base_tensor->name);
+                base_tensors.push_back(std::make_pair(out_tensor, true));
+            } else {
+                throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
+            }
+            gguf_add_tensor(ctx_out, out_tensor);
         }
-        if (ret != 1) {
-            die("unexpectedly reached end of file");
+
+        // placeholder for the meta data
+        {
+            size_t meta_size = gguf_get_meta_size(ctx_out);
+            zeros(fout, meta_size);
         }
-    }
 
-    std::uint32_t read_u32() {
-        std::uint32_t ret;
-        read_raw(&ret, sizeof(ret));
-        return ret;
-    }
-
-    std::string read_string(std::uint32_t len) {
-        std::vector<char> chars(len);
-        read_raw(chars.data(), len);
-        return std::string(chars.data(), len);
-    }
-
-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
-            return;
+        // process base model tensors
+        size_t n_merged = 0;
+        for (auto & it : base_tensors) {
+            if (it.second) {
+                merge_tensor(it.first);
+                n_merged++;
+            } else {
+                copy_tensor(it.first);
+            }
         }
-        errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
-        if (ret != 1) {
-            die_fmt("write error: %s", strerror(errno));
+
+        // write output metadata
+        {
+            std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
+            gguf_get_meta_data(ctx_out, data.data());
+            fout.seekp(0);
+            fout.write((const char *)data.data(), data.size());
         }
+
+        printf("%s : merged %zu tensors with lora adapters\n", __func__, n_merged);
+        printf("%s : wrote %zu tensors to output file\n", __func__, base_tensors.size());
     }
 
-    void write_u32(std::uint32_t val) {
-        write_raw(&val, sizeof(val));
+    void copy_tensor(struct ggml_tensor * base) { // copies the tensor bytes from the base model, then pads to GGUF_DEFAULT_ALIGNMENT
+        printf("%s :  %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
+        size_t len = ggml_nbytes(base);
+        base_model.read_tensor_data(base->name, read_buf);
+        fout.write((char* )read_buf.data(), len);
+        zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
     }
 
-    bool eof() {
-        return tell() >= size;
-    }
+    void merge_tensor(struct ggml_tensor * base) { // adds every adapter's scaled delta to the base tensor and writes the result
+        std::string name_base(base->name);
+        std::string name_lora_a = name_base + ".lora_a";
+        std::string name_lora_b = name_base + ".lora_b";
 
-    ~llama_file() {
-        if (fp) {
-            std::fclose(fp);
+        printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
+
+        // context for input tensor
+        std::vector<struct ggml_tensor *> inp_a(adapters.size());
+        std::vector<struct ggml_tensor *> inp_b(adapters.size());
+        struct ggml_init_params params {
+            /*.mem_size   =*/ ggml_tensor_overhead()*(1+adapters.size()*2),
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
+        };
+        struct ggml_context * ctx = ggml_init(params);
+
+        // alloc tensors
+        struct ggml_tensor * inp = ggml_dup_tensor(ctx, base); // NOTE(review): `base` is the output tensor (type possibly overridden in run_merge); confirm it matches the bytes read from base_model below
+        for (size_t i = 0; i < adapters.size(); ++i) {
+            auto t_a = adapters[i]->get_tensor(name_lora_a);
+            auto t_b = adapters[i]->get_tensor(name_lora_b);
+            inp_a[i] = ggml_dup_tensor(ctx, t_a);
+            inp_b[i] = ggml_dup_tensor(ctx, t_b);
         }
+        ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
+
+        // load data to backend buffer
+        base_model.read_tensor_data(name_base, read_buf);
+        ggml_backend_tensor_set(inp, read_buf.data(), 0, ggml_nbytes(inp));
+        for (size_t i = 0; i < adapters.size(); ++i) {
+            adapters[i]->read_tensor_data(name_lora_a, read_buf);
+            ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i]));
+            adapters[i]->read_tensor_data(name_lora_b, read_buf);
+            ggml_backend_tensor_set(inp_b[i], read_buf.data(), 0, ggml_nbytes(inp_b[i]));
+        }
+
+        // build graph
+        struct ggml_cgraph * gf;
+        {
+            static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
+            static std::vector<uint8_t> buf(buf_size);
+            struct ggml_init_params params0 = {
+                /*.mem_size   =*/ buf_size,
+                /*.mem_buffer =*/ buf.data(),
+                /*.no_alloc   =*/ true,
+            };
+            struct ggml_context * ctx0 = ggml_init(params0);
+            gf = ggml_new_graph(ctx0);
+            struct ggml_tensor * cur = inp;
+            for (size_t i = 0; i < adapters.size(); ++i) {
+                struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, inp_a[i]));
+                struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, inp_b[i]);
+                // scale
+                const float alpha = adapters[i]->alpha;
+                const float rank  = (float) inp_b[i]->ne[0];
+                const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
+                delta = ggml_scale(ctx0, delta, scale);
+                cur = ggml_add(ctx0, cur, delta);
+                printf("%s :   + merging from adapter[%zu]\n", __func__, i);
+                printf("%s :     input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
+            }
+            cur = ggml_cast(ctx0, cur, get_out_tensor_type(base));
+            ggml_build_forward_expand(gf, cur);
+            ggml_free(ctx0); // NOTE(review): gf is still used below; presumably safe because ctx0 is backed by the static buf - confirm
+        }
+
+        // compute
+        {
+            ggml_gallocr_alloc_graph(allocr, gf);
+            ggml_backend_cpu_set_n_threads(backend, n_threads);
+            ggml_backend_graph_compute(backend, gf);
+        }
+
+        // write data to output file
+        {
+            auto result = gf->nodes[gf->n_nodes - 1];
+            size_t len = ggml_nbytes(result);
+            if (read_buf.size() < len) {
+                read_buf.resize(len);
+            }
+            ggml_backend_tensor_get(result, read_buf.data(), 0, len);
+            fout.write((char* )read_buf.data(), len);
+            zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
+        }
+
+        ggml_free(ctx);
+        ggml_backend_buffer_free(buffer);
+    }
+
+    ~lora_merge_ctx() {
+        ggml_gallocr_free(allocr);
+        ggml_backend_free(backend);
+        gguf_free(ctx_out);
+        ggml_free(ctx_out_ggml);
     }
 };
 
-static struct export_lora_params get_default_export_lora_params() {
-    struct export_lora_params result;
-    result.fn_model_base = "";
-    result.fn_model_out  = "";
-    result.n_threads = GGML_DEFAULT_N_THREADS;
-    return result;
-}
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
 
-static void export_lora_print_usage(int /*argc*/, char ** argv, const struct export_lora_params * params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help                         show this help message and exit\n");
-    fprintf(stderr, "  -m FNAME, --model-base FNAME       model path from which to load base model (default '%s')\n", params->fn_model_base.c_str());
-    fprintf(stderr, "  -o FNAME, --model-out FNAME        path to save exported model (default '%s')\n", params->fn_model_out.c_str());
-    fprintf(stderr, "  -l FNAME, --lora FNAME             apply LoRA adapter\n");
-    fprintf(stderr, "  -s FNAME S, --lora-scaled FNAME S  apply LoRA adapter with user defined scaling S\n");
-    fprintf(stderr, "  -t N, --threads N                  number of threads to use during computation (default: %d)\n", params->n_threads);
-}
-
-static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_params * params) {
-    bool invalid_param = false;
-    std::string arg;
-    struct export_lora_params default_params = get_default_export_lora_params();
-    const std::string arg_prefix = "--";
-
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
-            std::replace(arg.begin(), arg.end(), '_', '-');
-        }
-
-        if (arg == "-m" || arg == "--model-base") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->fn_model_base = argv[i];
-        } else if (arg == "-o" || arg == "--model-out") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->fn_model_out = argv[i];
-        } else if (arg == "-l" || arg == "--lora") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            struct lora_info lora;
-            lora.filename = argv[i];
-            lora.scale = 1.0f;
-            params->lora.push_back(lora);
-        } else if (arg == "-s" || arg == "--lora-scaled") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            struct lora_info lora;
-            lora.filename = argv[i];
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            lora.scale = std::stof(argv[i]);
-            params->lora.push_back(lora);
-        } else if (arg == "-t" || arg == "--threads") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_threads = std::stoi(argv[i]);
-            if (params->n_threads <= 0) {
-                params->n_threads = std::thread::hardware_concurrency();
-            }
-        } else if (arg == "-h" || arg == "--help") {
-            export_lora_print_usage(argc, argv, &default_params);
-            exit(0);
-        } else {
-            fprintf(stderr, "error: unknown argument: '%s'\n", arg.c_str());
-            export_lora_print_usage(argc, argv, &default_params);
-            exit(1);
-        }
-    }
-
-    if (params->fn_model_base == default_params.fn_model_base) {
-        fprintf(stderr, "error: please specify a filename for model-base.\n");
-        export_lora_print_usage(argc, argv, &default_params);
-        exit(1);
-    }
-    if (params->fn_model_out == default_params.fn_model_out) {
-        fprintf(stderr, "error: please specify a filename for model-out.\n");
-        export_lora_print_usage(argc, argv, &default_params);
-        exit(1);
-    }
-    if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: '%s'\n", arg.c_str());
-        export_lora_print_usage(argc, argv, &default_params);
-        exit(1);
-    }
-    return true;
-}
-
-static void free_lora(struct lora_data * lora) {
-    if (lora->ctx != NULL) {
-        ggml_free(lora->ctx);
-    }
-    delete lora;
-}
-
-static struct lora_data * load_lora(struct lora_info * info) {
-    struct lora_data * result = new struct lora_data;
-    result->info = *info;
-    result->ctx = NULL;
-    result->lora_r     = 1;
-    result->lora_alpha = 1;
-
-    struct llama_file file(info->filename.c_str(), "rb");
-    if (file.fp == NULL) {
-        fprintf(stderr, "warning: Could not open lora adapter '%s'. Ignoring this adapter.\n",
-            info->filename.c_str());
-        free_lora(result);
-        return NULL;
-    }
-
-    struct ggml_init_params params_ggml;
-    params_ggml.mem_size   = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE;
-    params_ggml.mem_buffer = NULL;
-    params_ggml.no_alloc   = true;
-    result->ctx = ggml_init(params_ggml);
-
-    uint32_t magic   = file.read_u32();
-    if (magic != LLAMA_FILE_MAGIC_GGLA) {
-        die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str());
-    }
-    uint32_t version = file.read_u32();
-    if (version != 1) {
-        die_fmt("unexpected lora file version '%u' in '%s'", (unsigned) version, info->filename.c_str());
-    }
-    result->lora_r     = file.read_u32();
-    result->lora_alpha = file.read_u32();
-    // read tensor infos from file
-    std::vector<char> name_buf;
-    std::vector<struct ggml_tensor *> tensors;
-    std::vector<size_t> tensors_offset;
-    size_t total_nbytes_pad = 0;
-    while(!file.eof()) {
-        int64_t ne[4]   = {1,1,1,1};
-        uint32_t n_dims  = file.read_u32();
-        uint32_t namelen = file.read_u32();
-        uint32_t type    = file.read_u32();
-        for (uint32_t k = 0; k < n_dims; ++k) {
-            ne[k] = (int64_t)file.read_u32();
-        }
-        name_buf.clear();
-        name_buf.resize(namelen + 1, '\0');
-        file.read_raw(name_buf.data(), namelen);
-        file.seek((0-file.tell()) & 31, SEEK_CUR);
-        size_t offset = file.tell();
-        struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne);
-        ggml_set_name(tensor, name_buf.data());
-        size_t nbytes     = ggml_nbytes(tensor);
-        size_t nbytes_pad = ggml_nbytes_pad(tensor);
-        total_nbytes_pad += nbytes_pad;
-        tensors.push_back(tensor);
-        tensors_offset.push_back(offset);
-        file.seek(nbytes, SEEK_CUR);
-    }
-    // read tensor data
-    result->data.resize(total_nbytes_pad);
-    size_t data_offset = 0;
-    for (size_t i = 0; i < tensors.size(); ++i) {
-        struct ggml_tensor * tensor = tensors[i];
-        size_t offset     = tensors_offset[i];
-        size_t nbytes     = ggml_nbytes(tensor);
-        size_t nbytes_pad = ggml_nbytes_pad(tensor);
-        file.seek(offset, SEEK_SET);
-        tensor->data = result->data.data() + data_offset;
-        file.read_raw(tensor->data, nbytes);
-        data_offset += nbytes_pad;
-    }
-    return result;
-}
-
-
-static struct ggml_cgraph * build_graph_lora(
-    struct ggml_context * ctx,
-    struct ggml_tensor * tensor,
-    struct ggml_tensor * lora_a,
-    struct ggml_tensor * lora_b,
-    float scaling
-) {
-    struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
-    if (scaling != 1.0f) {
-        ab = ggml_scale(ctx, ab, scaling);
-    }
-    struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);
-
-    struct ggml_cgraph * gf = ggml_new_graph(ctx);
-    ggml_build_forward_expand (gf, res);
-    return gf;
-}
-
-static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int n_threads) {
-    if (lora->ctx == NULL) {
-        return false;
-    }
-    std::string name = ggml_get_name(tensor);
-    std::string name_a = name + std::string(".loraA");
-    std::string name_b = name + std::string(".loraB");
-    struct ggml_tensor * lora_a = ggml_get_tensor(lora->ctx, name_a.c_str());
-    struct ggml_tensor * lora_b = ggml_get_tensor(lora->ctx, name_b.c_str());
-    if (lora_a == NULL || lora_b == NULL) {
-        return false;
-    }
-
-    float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r;
-
-    struct ggml_init_params params;
-    params.mem_size   = GGML_OBJECT_SIZE + ggml_graph_overhead() + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
-    params.mem_buffer = NULL;
-    params.no_alloc   = true;
-    struct ggml_context * ctx = NULL;
-    struct ggml_gallocr * alloc = NULL;
-    struct ggml_cgraph  * gf = NULL;
-
-    ctx   = ggml_init(params);
-    alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
-    gf    = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
-
-    ggml_gallocr_alloc_graph(alloc, gf);
-
-    struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);
-    static std::vector<uint8_t> data_work;
-    data_work.resize(cplan.work_size);
-    cplan.work_data = data_work.data();
-
-    ggml_graph_compute(gf, &cplan);
-
-    ggml_gallocr_free(alloc);
-    ggml_free(ctx);
-    return true;
-}
-
-static void export_lora(struct export_lora_params * params) {
-    // load all loras
-    std::vector<struct lora_data *> loras;
-    for (size_t i = 0; i < params->lora.size(); ++i) {
-        struct lora_data * lora = load_lora(&params->lora[i]);
-        if (lora != NULL) {
-            loras.push_back(lora);
-        }
-    }
-    if (loras.size() == 0) {
-        fprintf(stderr, "warning: no lora adapters will be applied.\n");
-    }
-
-    // open input file
-    struct llama_file fin(params->fn_model_base.c_str(), "rb");
-    if (!fin.fp) {
-        die_fmt("Could not open file '%s'\n", params->fn_model_base.c_str());
-    }
-
-    // open base model gguf, read tensors without their data
-    struct ggml_context * ctx_in;
-    struct gguf_init_params params_gguf;
-    params_gguf.no_alloc = true;
-    params_gguf.ctx      = &ctx_in;
-    struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf);
-
-    // create new gguf
-    struct gguf_context * gguf_out = gguf_init_empty();
-
-    // copy meta data from base model: kv and tensors
-    gguf_set_kv(gguf_out, gguf_in);
-    int n_tensors = gguf_get_n_tensors(gguf_in);
-    for (int i=0; i < n_tensors; ++i) {
-        const char * name = gguf_get_tensor_name(gguf_in, i);
-        struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
-        gguf_add_tensor(gguf_out, tensor);
-    }
-
-    // create output file
-    struct llama_file fout(params->fn_model_out.c_str(), "wb");
-    if (!fout.fp) {
-        die_fmt("Could not create file '%s'\n", params->fn_model_out.c_str());
-    }
-
-    // write gguf meta data
-    std::vector<uint8_t> meta;
-    meta.resize(gguf_get_meta_size(gguf_out));
-    gguf_get_meta_data(gguf_out, meta.data());
-    fout.write_raw(meta.data(), meta.size());
-
-    std::vector<uint8_t> data;
-    std::vector<uint8_t> padding;
-    for (int i=0; i < n_tensors; ++i) {
-        const char * name = gguf_get_tensor_name(gguf_in, i);
-        struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
-
-        // read tensor data
-        data.resize(ggml_nbytes(tensor));
-        tensor->data = data.data();
-        size_t offset = gguf_get_tensor_offset(gguf_in, i);
-        fin.seek(offset + meta.size(), SEEK_SET);
-        fin.read_raw(data.data(), data.size());
-
-        // apply all loras
-        for (size_t k = 0; k < loras.size(); ++k) {
-            apply_lora(tensor, loras[k], params->n_threads);
-        }
-
-        // write tensor data + padding
-        padding.clear();
-        padding.resize(GGML_PAD(data.size(), gguf_get_alignment(gguf_out)) - data.size(), 0);
-
-        GGML_ASSERT(fout.tell() == offset + meta.size());
-        // fout.seek(offset + meta.size(), SEEK_SET);
-        fout.write_raw(data.data(), data.size());
-        fout.write_raw(padding.data(), padding.size());
-
-        if (i % 2 == 0) {
-            printf(".");
-        }
-    }
+    printf("\nexample usage:\n");
+    printf("\n  %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
+    printf("\nNOTE: output model is F16\n");
     printf("\n");
-
-    // close gguf
-    gguf_free(gguf_out);
-    gguf_free(gguf_in);
-
-    // free loras
-    for (size_t i = 0; i < loras.size(); ++i) {
-        free_lora(loras[i]);
-    }
 }
 
 int main(int argc, char ** argv) {
-    struct export_lora_params params = get_default_export_lora_params();
+    gpt_params params;
 
-    if (!export_lora_params_parse(argc, argv, &params)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params); // parsing failed: show the usage text before exiting with status 1
         return 1;
     }
 
-    export_lora(&params);
+    g_verbose = (params.verbosity == 1); // verbose flag is set only when verbosity is exactly 1
+    try { // any std::exception raised while constructing the context or merging is reported below
+        lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.n_threads);
+        ctx.run_merge();
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s\n", err.what());
+        exit(EXIT_FAILURE);
+    }
+
+    printf("done, output file is %s\n", params.lora_outfile.c_str());
 
     return 0;
 }

From b115105f05e3372bc75b2a486c1930c365fd2846 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Wed, 24 Jul 2024 11:25:19 +0200
Subject: [PATCH 02/35] add llama_lora_adapter_clear (#8653)

---
 include/llama.h | 6 +++++-
 src/llama.cpp   | 4 ++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/include/llama.h b/include/llama.h
index e68cd807e..413070d95 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -529,12 +529,16 @@ extern "C" {
             struct llama_lora_adapter * adapter,
             float scale);
 
-    // Remove a LoRA adapter from given context
+    // Remove a specific LoRA adapter from given context
     // Return -1 if the adapter is not present in the context
     LLAMA_API int32_t llama_lora_adapter_remove(
             struct llama_context * ctx,
             struct llama_lora_adapter * adapter);
 
+    // Remove all LoRA adapters from given context
+    LLAMA_API void llama_lora_adapter_clear(
+            struct llama_context * ctx);
+
     // Manually free a LoRA adapter
     // Note: loaded adapters will be free when the associated model is deleted
     LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
diff --git a/src/llama.cpp b/src/llama.cpp
index 40c5e8e8d..04eaf6730 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16201,6 +16201,10 @@ int32_t llama_lora_adapter_remove(
     return -1;
 }
 
+void llama_lora_adapter_clear(struct llama_context * ctx) {
+    ctx->lora_adapters.clear(); // NOTE(review): only empties the context's adapter container; presumably the adapters stay owned by the model - confirm
+}
+
 void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
     delete adapter;
 }

From 79167d9e49aef9caa98e13ee7ca067ec9f88b4b5 Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Wed, 24 Jul 2024 11:55:26 +0100
Subject: [PATCH 03/35] Re-add erroneously removed -fsycl from GGML_EXTRA_LIBS
 (#8667)

---
 ggml/src/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 12c440327..c6496c921 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -510,10 +510,10 @@ if (GGML_SYCL)
         set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
     else()
         if (GGML_SYCL_TARGET STREQUAL "INTEL")
-            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
+            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
         elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
             set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
-            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} pthread m dl onemkl)
+            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
         endif()
     endif()
 endif()

From 96952e7181929c6001b2bc69a33f240de731cc3a Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Wed, 24 Jul 2024 13:48:46 +0200
Subject: [PATCH 04/35] llama : fix `llama_chat_format_single` for mistral
 (#8657)

* fix `llama_chat_format_single` for mistral

* fix typo

* use printf
---
 common/common.cpp            |  2 +-
 examples/main/main.cpp       |  1 +
 tests/test-chat-template.cpp | 30 ++++++++++++++++++++++++------
 3 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 4c19132f1..ec44a0552 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2723,7 +2723,7 @@ std::string llama_chat_format_single(const struct llama_model * model,
         const llama_chat_msg & new_msg,
         bool add_ass) {
     std::ostringstream ss;
-    auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
+    auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
     std::vector<llama_chat_msg> chat_new(past_msg);
     // if the past_msg ends with a newline, we must preserve it in the formatted version
     if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index a0d817b1a..61e960ea2 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -124,6 +124,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
     auto formatted = llama_chat_format_single(
         model, g_params->chat_template, chat_msgs, new_msg, role == "user");
     chat_msgs.push_back({role, content});
+    LOG("formatted: %s\n", formatted.c_str());
     return formatted;
 }
 
diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp
index 6583dd0b2..46a7d3aea 100644
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -1,4 +1,3 @@
-#include <iostream>
 #include <string>
 #include <vector>
 #include <sstream>
@@ -133,13 +132,31 @@ int main(void) {
         );
         formatted_chat.resize(res);
         std::string output(formatted_chat.data(), formatted_chat.size());
-        std::cout << output << "\n-------------------------\n";
+        printf("%s\n", output.c_str());
+        printf("-------------------------\n");
         assert(output == expected);
     }
 
-    // test llama_chat_format_single
-    std::cout << "\n\n=== llama_chat_format_single ===\n\n";
+
+    // test llama_chat_format_single for system message
+    printf("\n\n=== llama_chat_format_single (system message) ===\n\n");
     std::vector<llama_chat_msg> chat2;
+    llama_chat_msg sys_msg{"system", "You are a helpful assistant"};
+
+    auto fmt_sys = [&](std::string tmpl) { // formats sys_msg against chat2, which is still empty when this is called
+        auto output = llama_chat_format_single(nullptr, tmpl, chat2, sys_msg, false);
+        printf("fmt_sys(%s) : %s\n", tmpl.c_str(), output.c_str());
+        printf("-------------------------\n"); // separator only: the format has no conversion, so no argument is passed
+        return output;
+    };
+    assert(fmt_sys("chatml") == "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n");
+    assert(fmt_sys("llama2") == "[INST] You are a helpful assistant\n");
+    assert(fmt_sys("gemma")  == ""); // for gemma, system message is merged with user message
+    assert(fmt_sys("llama3") == "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|>");
+
+
+    // test llama_chat_format_single for user message
+    printf("\n\n=== llama_chat_format_single (user message) ===\n\n");
     chat2.push_back({"system", "You are a helpful assistant"});
     chat2.push_back({"user", "Hello"});
     chat2.push_back({"assistant", "I am assistant"});
@@ -147,12 +164,13 @@ int main(void) {
 
     auto fmt_single = [&](std::string tmpl) {
         auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
-        std::cout << "fmt_single(" << tmpl << ")\n" << output << "\n-------------------------\n";
+        printf("fmt_single(%s) : %s\n", tmpl.c_str(), output.c_str());
+        printf("-------------------------\n"); // separator only: the format has no conversion, so no argument is passed
         return output;
     };
     assert(fmt_single("chatml") == "\n<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n");
     assert(fmt_single("llama2") == "[INST] How are you [/INST]");
-    assert(fmt_single("gemma") == "\n<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
+    assert(fmt_single("gemma")  == "\n<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
     assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");
 
     return 0;

From 3a7ac5300a7e8ebbe4a3eb5aff9dba11ed76ea61 Mon Sep 17 00:00:00 2001
From: Thorsten Sommer <SommerEngineering@users.noreply.github.com>
Date: Wed, 24 Jul 2024 14:52:30 +0200
Subject: [PATCH 05/35] readme : update UI list [no ci] (#8505)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 7c233b5e1..b7b9bf588 100644
--- a/README.md
+++ b/README.md
@@ -138,6 +138,7 @@ Typically finetunes of the base models below are supported as well.
 
 Unless otherwise noted these projects are open-source with permissive licensing:
 
+- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
 - [iohub/collama](https://github.com/iohub/coLLaMA)
 - [janhq/jan](https://github.com/janhq/jan) (AGPL)
 - [nat/openplayground](https://github.com/nat/openplayground)

From f19bf99c015d3d745143e8bb4f056e0ea015ad40 Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Wed, 24 Jul 2024 14:36:00 +0100
Subject: [PATCH 06/35] Build Llama SYCL Intel with static libs (#8668)

Ensure SYCL CI builds both static & dynamic libs for testing purposes

Signed-off-by: Joe Todd <joe.todd@codeplay.com>
---
 .devops/llama-cli-intel.Dockerfile    | 4 +++-
 .devops/llama-server-intel.Dockerfile | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/.devops/llama-cli-intel.Dockerfile b/.devops/llama-cli-intel.Dockerfile
index 2bf82bb58..79dba06a7 100644
--- a/.devops/llama-cli-intel.Dockerfile
+++ b/.devops/llama-cli-intel.Dockerfile
@@ -14,7 +14,9 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
         echo "GGML_SYCL_F16 is set" && \
         export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
     fi && \
-    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+    echo "Building with static libs" && \
+    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
+    ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
     cmake --build build --config Release --target llama-cli
 
 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/llama-server-intel.Dockerfile
index eb9aba618..f525658dd 100644
--- a/.devops/llama-server-intel.Dockerfile
+++ b/.devops/llama-server-intel.Dockerfile
@@ -14,6 +14,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
         echo "GGML_SYCL_F16 is set" && \
         export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
     fi && \
+    echo "Building with dynamic libs" && \
     cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
     cmake --build build --config Release --target llama-server
 

From 68504f0970db5a3602d176953690f503059906b1 Mon Sep 17 00:00:00 2001
From: MorganRO8 <47795945+MorganRO8@users.noreply.github.com>
Date: Wed, 24 Jul 2024 12:48:00 -0400
Subject: [PATCH 07/35] readme : update games list (#8673)

Added link to game I made that depends on llama
---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index b7b9bf588..d0ae2efb9 100644
--- a/README.md
+++ b/README.md
@@ -182,6 +182,9 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 
 - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
 
+**Games:**
+- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
+
 ## Demo
 
 <details>

From 8a4bad50a8ed24ed1e9df003521468dcc37320e8 Mon Sep 17 00:00:00 2001
From: Fan Shupei <dymarkfan@outlook.com>
Date: Thu, 25 Jul 2024 15:21:09 +0800
Subject: [PATCH 08/35] llama: use sliding window for phi3 (#8627)

* use sliding window for phi3

* fix typo, "data_swa" -> "data"

* [convert_hf_to_gguf.py] add phi3 sliding window
---
 convert_hf_to_gguf.py |  1 +
 src/llama.cpp         | 37 ++++++++++++++++++++++++++++---------
 2 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index dde4fa9c8..4087187c1 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2084,6 +2084,7 @@ class Phi3MiniModel(Model):
         self.gguf_writer.add_rope_dimension_count(rope_dims)
         self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
         self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))
 
         # write rope scaling for long context (128k) model
         rope_scaling = self.find_hparam(['rope_scaling'], True)
diff --git a/src/llama.cpp b/src/llama.cpp
index 04eaf6730..9e502018d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4889,6 +4889,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_PHI3:
             {
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
                 switch (hparams.n_layer) {
@@ -10748,7 +10749,7 @@ struct llm_build_context {
         struct ggml_tensor * inp_pos = build_inp_pos();
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+        struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
 
         for (int il = 0; il < n_layer; ++il) {
             auto residual = inpL;
@@ -10806,7 +10807,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -14013,18 +14014,23 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         "causal attention is not supported by this model"
     );
 
-    if (lctx.inp_KQ_mask) {
+    if (lctx.inp_KQ_mask || lctx.inp_KQ_mask_swa) {
         // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache.
         if (cparams.causal_attn && !lctx.is_encoding) {
             const int64_t n_kv     = kv_self.n;
             const int64_t n_tokens = batch.n_tokens;
 
-            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
 
-            float * data     = (float *) lctx.inp_KQ_mask->data;
+            float * data     = nullptr;
             float * data_swa = nullptr;
 
+            if (lctx.inp_KQ_mask) {
+                GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
+                data = (float *) lctx.inp_KQ_mask->data;
+            }
+
             if (lctx.inp_KQ_mask_swa) {
+                GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_swa->buffer));
                 data_swa = (float *) lctx.inp_KQ_mask_swa->data;
             }
 
@@ -14047,7 +14053,10 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
                                 f = 0.0f;
                             }
                         }
-                        data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
+
+                        if (data) {
+                            data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
+                        }
 
                         // may need to cut off old tokens for sliding window
                         if (data_swa) {
@@ -14059,9 +14068,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
                     }
                 }
 
-                for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                    for (int j = 0; j < n_kv; ++j) {
-                        data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
+                if (data) {
+                    for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+                        for (int j = 0; j < n_kv; ++j) {
+                            data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
+                        }
+                    }
+                }
+
+                if (data_swa) {
+                    for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+                        for (int j = 0; j < n_kv; ++j) {
+                            data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
+                        }
                     }
                 }
             }

From 4b0eff3df58d8d86e47348fb73d54da3194d416d Mon Sep 17 00:00:00 2001
From: Ujjawal Panchal <31011628+Ujjawal-K-Panchal@users.noreply.github.com>
Date: Thu, 25 Jul 2024 13:43:27 +0530
Subject: [PATCH 09/35] docs : Quantum -> Quantized (#8666)

* docfix: imatrix readme, quantum models -> quantized models.

* docfix: server readme: quantum models -> quantized models.
---
 examples/imatrix/README.md | 2 +-
 examples/server/README.md  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md
index 29602881a..bb5faec94 100644
--- a/examples/imatrix/README.md
+++ b/examples/imatrix/README.md
@@ -1,6 +1,6 @@
 # llama.cpp/examples/imatrix
 
-Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantum models.
+Compute an importance matrix for a model and given text dataset. Can be used during quantization to enhance the quality of the quantized models.
 More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861
 
 ## Usage
diff --git a/examples/server/README.md b/examples/server/README.md
index ff4074517..33a2b95cc 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -5,7 +5,7 @@ Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/
 Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
 
 **Features:**
- * LLM inference of F16 and quantum models on GPU and CPU
+ * LLM inference of F16 and quantized models on GPU and CPU
  * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
  * Parallel decoding with multi-user support
  * Continuous batching

From be6d7c079173d941b4f784500f9148f46cec2724 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Thu, 25 Jul 2024 10:39:04 +0200
Subject: [PATCH 10/35] examples : remove `finetune` and
 `train-text-from-scratch` (#8669)

* examples : remove finetune and train-text-from-scratch

* fix build

* update help message

* fix small typo for export-lora
---
 .devops/nix/apps.nix                          |    1 -
 .devops/tools.sh                              |    4 -
 Makefile                                      |   30 +-
 examples/CMakeLists.txt                       |    2 -
 examples/deprecation-warning/README.md        |    2 -
 examples/export-lora/README.md                |   12 +-
 examples/finetune/CMakeLists.txt              |    5 -
 examples/finetune/README.md                   |   90 -
 .../convert_finetune_checkpoint_to_gguf.py    |  487 -----
 examples/finetune/finetune.cpp                | 1862 -----------------
 examples/finetune/finetune.sh                 |   34 -
 .../train-text-from-scratch/CMakeLists.txt    |    5 -
 examples/train-text-from-scratch/README.md    |   27 -
 .../convert_train_checkpoint_to_gguf.py       |  499 -----
 .../train-text-from-scratch.cpp               | 1253 -----------
 15 files changed, 14 insertions(+), 4299 deletions(-)
 delete mode 100644 examples/finetune/CMakeLists.txt
 delete mode 100644 examples/finetune/README.md
 delete mode 100644 examples/finetune/convert_finetune_checkpoint_to_gguf.py
 delete mode 100644 examples/finetune/finetune.cpp
 delete mode 100644 examples/finetune/finetune.sh
 delete mode 100644 examples/train-text-from-scratch/CMakeLists.txt
 delete mode 100644 examples/train-text-from-scratch/README.md
 delete mode 100644 examples/train-text-from-scratch/convert_train_checkpoint_to_gguf.py
 delete mode 100644 examples/train-text-from-scratch/train-text-from-scratch.cpp

diff --git a/.devops/nix/apps.nix b/.devops/nix/apps.nix
index 897fce4d3..0ecf19fc5 100644
--- a/.devops/nix/apps.nix
+++ b/.devops/nix/apps.nix
@@ -10,7 +10,6 @@
             "llama-embedding"
             "llama-server"
             "llama-quantize"
-            "llama-train-text-from-scratch"
           ];
           mkApp = name: {
             type = "app";
diff --git a/.devops/tools.sh b/.devops/tools.sh
index cf0e8f32d..24dcfd350 100755
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -13,8 +13,6 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
     ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
     ./llama-cli "$@"
-elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
-    ./llama-finetune "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
     echo "Converting PTH to GGML..."
     for i in `ls $1/$2/ggml-model-f16.bin*`; do
@@ -36,8 +34,6 @@ else
     echo "              ex: --outtype f16 \"/models/7B/\" "
     echo "  --quantize (-q): Optimize with quantization process ggml"
     echo "              ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
-    echo "  --finetune (-f): Run finetune command to create a lora finetune of the model"
-    echo "              See documentation for finetune for command-line parameters"
     echo "  --all-in-one (-a): Execute --convert & --quantize"
     echo "              ex: \"/models/\" 7B"
     echo "  --server (-s): Run a model on the server"
diff --git a/Makefile b/Makefile
index 58a93db1a..8d2ccddc4 100644
--- a/Makefile
+++ b/Makefile
@@ -11,7 +11,6 @@ BUILD_TARGETS = \
 	llama-embedding \
 	llama-eval-callback \
 	llama-export-lora \
-	llama-finetune \
 	llama-gbnf-validator \
 	llama-gguf \
 	llama-gguf-hash \
@@ -37,7 +36,6 @@ BUILD_TARGETS = \
 	llama-simple \
 	llama-speculative \
 	llama-tokenize \
-	llama-train-text-from-scratch \
 	llama-vdot \
 	llama-cvector-generator \
 	tests/test-c.o
@@ -64,13 +62,13 @@ TEST_TARGETS = \
 	tests/test-tokenizer-1-spm
 
 # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
-LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
+LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
 	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
-	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm
+	retrieval speculative infill tokenize benchmark-matmult parallel export-lora lookahead lookup passkey gritlm
 
 # Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
 #  We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
-LEGACY_TARGETS_BUILD = main quantize perplexity embedding server finetune
+LEGACY_TARGETS_BUILD = main quantize perplexity embedding server
 
 # Deprecation aliases
 ifdef LLAMA_CUBLAS
@@ -1296,11 +1294,6 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
 	$(OBJ_GGML) $(OBJ_LLAMA)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@@ -1316,11 +1309,6 @@ llama-baby-llama: examples/baby-llama/baby-llama.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-llama-finetune: examples/finetune/finetune.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 llama-export-lora: examples/export-lora/export-lora.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@@ -1578,7 +1566,7 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
 # Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
 #
 # Mark legacy binary targets as .PHONY so that they are always checked.
-.PHONY: main quantize perplexity embedding server finetune
+.PHONY: main quantize perplexity embedding server
 
 # NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
 #  Eventually we will want to remove these target from building all the time.
@@ -1621,13 +1609,3 @@ ifneq (,$(wildcard embedding))
 	@echo "  Remove the 'embedding' binary to remove this warning."
 	@echo "#########"
 endif
-
-finetune: examples/deprecation-warning/deprecation-warning.cpp
-ifneq (,$(wildcard finetune))
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-	@echo "#########"
-	@echo "WARNING: The 'finetune' binary is deprecated. Please use 'llama-finetune' instead."
-	@echo "  Remove the 'finetune' binary to remove this warning."
-	@echo "#########"
-endif
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 155743639..67b3d2774 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -21,7 +21,6 @@ else()
     add_subdirectory(embedding)
     add_subdirectory(eval-callback)
     add_subdirectory(export-lora)
-    add_subdirectory(finetune)
     add_subdirectory(gbnf-validator)
     add_subdirectory(gguf-hash)
     add_subdirectory(gguf-split)
@@ -53,5 +52,4 @@ else()
     add_subdirectory(simple)
     add_subdirectory(speculative)
     add_subdirectory(tokenize)
-    add_subdirectory(train-text-from-scratch)
 endif()
diff --git a/examples/deprecation-warning/README.md b/examples/deprecation-warning/README.md
index 1e20feb4a..59918ec2b 100644
--- a/examples/deprecation-warning/README.md
+++ b/examples/deprecation-warning/README.md
@@ -13,7 +13,6 @@ Please update all scripts and workflows to use the new binary names.
 | server | llama-server |
 | llama-bench | llama-bench |
 | embedding | llama-embedding |
-| finetune | llama-finetune |
 | quantize | llama-quantize |
 | tokenize | llama-tokenize |
 | export-lora | llama-export-lora |
@@ -45,7 +44,6 @@ Please update all scripts and workflows to use the new binary names.
 | save-load-state | llama-save-load-state |
 | simple | llama-simple |
 | speculative | llama-speculative |
-| train-text-from-scratch | llama-train-text-from-scratch |
 | vdot | llama-vdot |
 | tests/test-c.o | tests/test-c.o |
 
diff --git a/examples/export-lora/README.md b/examples/export-lora/README.md
index 6d51f4b24..91c33c34a 100644
--- a/examples/export-lora/README.md
+++ b/examples/export-lora/README.md
@@ -19,7 +19,15 @@ For example:
 ./bin/llama-export-lora \
     -m open-llama-3b-v2-q8_0.gguf \
     -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
-    --lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
+    --lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.gguf
 ```
 
-Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters.
+Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters:
+
+```bash
+./bin/llama-export-lora \
+    -m your_base_model.gguf \
+    -o your_merged_model.gguf \
+    --lora-scaled lora_task_A.gguf 0.5 \
+    --lora-scaled lora_task_B.gguf 0.5
+```
diff --git a/examples/finetune/CMakeLists.txt b/examples/finetune/CMakeLists.txt
deleted file mode 100644
index 64afe6ddc..000000000
--- a/examples/finetune/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-set(TARGET llama-finetune)
-add_executable(${TARGET} finetune.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/finetune/README.md b/examples/finetune/README.md
deleted file mode 100644
index 1c27df053..000000000
--- a/examples/finetune/README.md
+++ /dev/null
@@ -1,90 +0,0 @@
-# finetune
-
-Basic usage instructions:
-
-```bash
-# get training data
-wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
-
-# finetune LORA adapter
-./bin/llama-finetune \
-        --model-base open-llama-3b-v2-q8_0.gguf \
-        --checkpoint-in  chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
-        --checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
-        --lora-out lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \
-        --train-data "shakespeare.txt" \
-        --save-every 10 \
-        --threads 6 --adam-iter 30 --batch 4 --ctx 64 \
-        --use-checkpointing
-
-# predict
-./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
-```
-
-**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
-The pattern 'ITERATION' in the output filenames will be replaced with the iteration number and with 'LATEST' for the latest output.
-So in above example after 10 iterations these files will be written:
-- chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf
-- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
-- lora-open-llama-3b-v2-q8_0-shakespeare-10.bin
-- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
-
-After 10 more iterations:
-- chk-lora-open-llama-3b-v2-q8_0-shakespeare-20.gguf
-- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
-- lora-open-llama-3b-v2-q8_0-shakespeare-20.bin
-- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
-
-Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.
-
-llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`.
-These LORA adapters can then be used by `llama-cli` together with the base model, like in the 'predict' example command above.
-
-In `llama-cli` you can also load multiple LORA adapters, which will then be mixed together.
-
-For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
-
-```bash
-./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
-  --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
-  --lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
-```
-
-You can change how strong each LORA adapter is applied to the base model by using `--lora-scaled FN SCALE` instead of `--lora FN`.
-
-For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
-
-```bash
-./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
-  --lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
-  --lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
-  --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
-```
-
-The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values.
-
-Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
-If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.
-
-The default LORA rank can be specified with `--lora-r N`.
-The LORA rank can be configured for each model tensor type separately with these command line options:
-
-```bash
-  --lora-r N                 LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default 4)
-  --rank-att-norm N          LORA rank for attention norm tensor (default 1)
-  --rank-ffn-norm N          LORA rank for feed-forward norm tensor (default 1)
-  --rank-out-norm N          LORA rank for output norm tensor (default 1)
-  --rank-tok-embd N          LORA rank for token embeddings tensor (default 4)
-  --rank-out N               LORA rank for output tensor (default 4)
-  --rank-wq N                LORA rank for wq tensor (default 4)
-  --rank-wk N                LORA rank for wk tensor (default 4)
-  --rank-wv N                LORA rank for wv tensor (default 4)
-  --rank-wo N                LORA rank for wo tensor (default 4)
-  --rank-ffn_gate N          LORA rank for ffn_gate tensor (default 4)
-  --rank-ffn_down N          LORA rank for ffn_down tensor (default 4)
-  --rank-ffn_up N            LORA rank for ffn_up tensor (default 4)
-```
-
-The LORA rank of 'norm' tensors should always be 1.
-
-To see all available options use `llama-finetune --help`.
diff --git a/examples/finetune/convert_finetune_checkpoint_to_gguf.py b/examples/finetune/convert_finetune_checkpoint_to_gguf.py
deleted file mode 100644
index 1b79d6995..000000000
--- a/examples/finetune/convert_finetune_checkpoint_to_gguf.py
+++ /dev/null
@@ -1,487 +0,0 @@
-#!/usr/bin/env python3
-# finetune checkpoint --> gguf conversion
-
-import argparse
-import gguf
-import struct
-import numpy as np
-from pathlib import Path
-
-# gguf constants
-LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
-LLM_KV_OPTIMIZER_TYPE_ADAM  = "adam"
-LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
-LLM_KV_OPTIMIZER_FILE_VERSION               = "optimizer.file_version"
-LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT     = "optimizer.convergence_past_count"
-LLM_KV_OPTIMIZER_PARAMETER_COUNT            = "optimizer.parameter_count"
-LLM_KV_OPTIMIZER_ITERATION_COUNT            = "optimizer.iteration_count"
-LLM_KV_OPTIMIZER_JUST_INITIALIZED           = "optimizer.just_initialized"
-LLM_KV_OPTIMIZER_ADAM_BEST_LOSS             = "optimizer.adam.best_loss"
-LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS         = "optimizer.adam.previous_loss"
-LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT  = "optimizer.adam.no_improvement_count"
-LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
-LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS            = "optimizer.lbfgs.best_loss"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP     = "optimizer.lbfgs.line_search_step"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J        = "optimizer.lbfgs.line_search_j"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K        = "optimizer.lbfgs.line_search_k"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END      = "optimizer.lbfgs.line_search_end"
-LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
-
-LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS    = "optimizer.adam.first_moments"
-LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS   = "optimizer.adam.second_moments"
-LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
-
-LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS  = "optimizer.lbfgs.current_parameters"
-LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
-LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS   = "optimizer.lbfgs.current_gradients"
-LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS  = "optimizer.lbfgs.previous_gradients"
-LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION    = "optimizer.lbfgs.search_direction"
-LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES    = "optimizer.lbfgs.past_loss_values"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA        = "optimizer.lbfgs.memory_alpha"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS           = "optimizer.lbfgs.memory_ys"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S            = "optimizer.lbfgs.memory_s"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y            = "optimizer.lbfgs.memory_y"
-
-LLM_KV_TRAINING_TYPE_TRAIN_MODEL   = "train_model"
-LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
-LLM_KV_TRAINING_TYPE               = "training.type"
-LLM_KV_TRAINING_FILE_VERSION       = "training.file_version"
-LLM_KV_TRAINING_ITERATION_COUNT    = "training.iteration_count"
-LLM_KV_TRAINING_SAMPLE_COUNT       = "training.sample_count"
-LLM_KV_TRAINING_TOKEN_COUNT        = "training.token_count"
-
-LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD  = "training.lora.rank.token_embd"
-LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"
-LLM_KV_TRAINING_LORA_RANK_OUTPUT      = "training.lora.rank.output"
-LLM_KV_TRAINING_LORA_RANK_ATTN_NORM   = "training.lora.rank.attn_norm"
-LLM_KV_TRAINING_LORA_RANK_ATTN_Q      = "training.lora.rank.attn_q"
-LLM_KV_TRAINING_LORA_RANK_ATTN_K      = "training.lora.rank.attn_k"
-LLM_KV_TRAINING_LORA_RANK_ATTN_V      = "training.lora.rank.attn_v"
-LLM_KV_TRAINING_LORA_RANK_ATTN_OUT    = "training.lora.rank.attn_output"
-LLM_KV_TRAINING_LORA_RANK_FFN_NORM    = "training.lora.rank.ffn_norm"
-LLM_KV_TRAINING_LORA_RANK_FFN_GATE    = "training.lora.rank.ffn_gate"
-LLM_KV_TRAINING_LORA_RANK_FFN_DOWN    = "training.lora.rank.ffn_down"
-LLM_KV_TRAINING_LORA_RANK_FFN_UP      = "training.lora.rank.ffn_up"
-
-class Tensor:
-    def __init__(self, dtype='f', ne=None):
-        if ne is None:
-            ne = []
-        self.dtype = dtype
-        self.ne = ne
-        self.nbytes = 0
-        if self.dtype == 'f':
-            if len(self.ne) == 0:
-                self.nbytes = 0
-            else:
-                self.nbytes = int(np.prod(self.ne)) * 4
-        else:
-            raise ValueError(f"Unhandled data type '{self.dtype}'")
-
-    def load(self, data, offset):
-        nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-
-        assert(nd == len(self.ne))
-        ne = []
-        for d in range(nd):
-            n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-            ne.append(n)
-
-        if tuple(ne) != tuple(self.ne):
-            raise ValueError(f"Tensor.load: Expected number of elements {str(self.ne)} does not match what is read from file {str(ne)}")
-
-        if self.dtype == 'f':
-            assert(dtype == 0)
-        else:
-            raise ValueError(f"Unhandled data type '{self.dtype}'")
-
-        self.name = bytes(data[offset:offset+namelen]); offset += namelen
-        # 32-byte alignment
-        offset += (0 - offset) & 31
-        self.data = data[offset:offset+self.nbytes]
-        offset += self.nbytes
-        return offset
-
-    def max_storage_size(self):
-        result = 0
-        result += 4 # nd
-        result += 4 # namelen
-        result += 4 # dtype
-        result += len(self.ne)*8 # ne
-        result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
-        result += 31 # 32-byte alignment
-        result += self.nbytes
-        return result
-
-    def save_gguf(self, gguf_writer, name):
-        gguf_writer.add_tensor(
-            name=name,
-            tensor=self.data,
-            raw_shape=np.array(list(reversed(self.ne))),
-            raw_dtype=gguf.GGMLQuantizationType.F32)
-
-class OptimizationContext:
-    def __init__(self):
-        pass
-
-    def load(self, data, offset):
-        self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
-        offset += 4
-
-        if self.version != 1:
-            raise ValueError('Invalid version of optimization context in checkpoint file')
-
-        self.past    = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.nx      = struct.unpack('N',  bytes(data[offset:offset + 8]))[0];  offset += 8
-        self.iter    = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]);  offset += 4
-
-        self.adam_m  = Tensor('f', [self.nx])
-        self.adam_v  = Tensor('f', [self.nx])
-        self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
-
-        self.lbfgs_x    = Tensor('f', [self.nx])
-        self.lbfgs_xp   = Tensor('f', [self.nx])
-        self.lbfgs_g    = Tensor('f', [self.nx])
-        self.lbfgs_gp   = Tensor('f', [self.nx])
-        self.lbfgs_d    = Tensor('f', [self.nx])
-        self.lbfgs_pf   = Tensor('f', [self.past] if self.past > 0 else [])
-        self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
-        self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
-        self.lbfgs_lms  = Tensor('f', [self.nx, self.lbfgs_m])
-        self.lbfgs_lmy  = Tensor('f', [self.nx, self.lbfgs_m])
-
-        # forgot to save type in version 1:
-        # guess self.type from number of remaining bytes
-        size_type_0 = 12 + sum([t.max_storage_size() for t in
-                                [self.adam_m, self.adam_v]
-                                +([self.adam_pf] if (self.past > 0) else [])])
-        size_type_1 = 24 + sum([t.max_storage_size() for t in
-                                [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
-                                 self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
-                                 self.lbfgs_lmal, self.lbfgs_lmys,
-                                 self.lbfgs_lms, self.lbfgs_lmy]
-                                 +([self.lbfgs_pf] if (self.past > 0) else [])])
-        # due to alignment padding the size might not by exact
-        # but the difference in size for both types is significant,
-        # so we can just use whichever is closest
-        remaining = len(data) - offset
-        if abs(remaining - size_type_0) < abs(remaining - size_type_1):
-            self.type = 0
-        else:
-            self.type = 1
-
-        if self.type == 0:
-            offset = self.adam_m.load(data, offset)
-            offset = self.adam_v.load(data, offset)
-            offset = self.adam_pf.load(data,offset)
-
-            self.adam_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.adam_fx_prev          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-
-        elif self.type == 1:
-            offset = self.lbfgs_x.load(data, offset)
-            offset = self.lbfgs_xp.load(data, offset)
-            offset = self.lbfgs_g.load(data, offset)
-            offset = self.lbfgs_gp.load(data, offset)
-            offset = self.lbfgs_d.load(data, offset)
-            offset = self.lbfgs_pf.load(data, offset)
-            offset = self.lbfgs_lmal.load(data, offset)
-            offset = self.lbfgs_lmys.load(data, offset)
-            offset = self.lbfgs_lms.load(data, offset)
-            offset = self.lbfgs_lmy.load(data, offset)
-
-            self.lbfgs_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.lbfgs_step             = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.lbfgs_j                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.lbfgs_k                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.lbfgs_end              = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-
-        else:
-            raise ValueError(f"Invalid optimizer type '{self.type}'")
-
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
-        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
-        gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
-        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
-        gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
-
-        if self.type == 0:
-            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
-            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
-
-            self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
-            self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
-            if self.past > 0:
-                self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
-
-        elif self.type == 1:
-            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
-            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
-            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
-            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
-            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
-            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
-
-            self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
-            self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
-            self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
-            self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
-            self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
-            if self.past > 0:
-                self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
-            self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
-            self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
-            self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
-            self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
-        else:
-            raise ValueError('Unknown optimizer type')
-
-class LoraParams:
-    def __init__(self):
-        pass
-
-    def load(self, data, offset):
-        self.n_rank_attention_norm  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_wq              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_wk              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_wv              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_wo              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_ffn_norm        = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_w1              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_w2              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_w3              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_tok_embeddings  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_norm            = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_output          = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD,  self.n_rank_tok_embeddings)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, self.n_rank_norm)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT,      self.n_rank_output)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_NORM,   self.n_rank_attention_norm)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_Q,      self.n_rank_wq)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_K,      self.n_rank_wk)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_V,      self.n_rank_wv)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_OUT,    self.n_rank_wo)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_NORM,    self.n_rank_ffn_norm)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_GATE,    self.n_rank_w1)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_DOWN,    self.n_rank_w2)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_UP,      self.n_rank_w3)
-
-class ModelParams:
-    def __init__(self, n_ff = None):
-        self.n_ff = n_ff
-
-    def load(self, data, offset):
-        self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_embd  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_mult  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_head  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rot   = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        return offset
-
-    def get_n_ff(self):
-        if self.n_ff is None:
-            # struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
-            return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
-        else:
-            return self.n_ff
-
-    def save_gguf(self, gguf_writer):
-        # self.n_vocab not saved
-        gguf_writer.add_embedding_length(self.n_embd)
-        gguf_writer.add_head_count(self.n_head)
-        gguf_writer.add_block_count(self.n_layer)
-        gguf_writer.add_rope_dimension_count(self.n_rot)
-        gguf_writer.add_feed_forward_length(self.get_n_ff())
-
-def tensor_name(key, bid=None, suffix=".weight"):
-    return gguf.TENSOR_NAMES[key].format(bid=bid) + suffix
-
-class Layer:
-    def __init__(self, params, lora_params, bid):
-        self.bid = bid
-        self.att_norm_a = Tensor('f', [lora_params.n_rank_attention_norm, params.n_embd])
-        self.att_norm_b = Tensor('f', [lora_params.n_rank_attention_norm, 1])
-        self.wq_a       = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
-        self.wq_b       = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
-        self.wk_a       = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
-        self.wk_b       = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
-        self.wv_a       = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
-        self.wv_b       = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
-        self.wo_a       = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
-        self.wo_b       = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
-        self.ffn_norm_a = Tensor('f', [lora_params.n_rank_ffn_norm, params.n_embd])
-        self.ffn_norm_b = Tensor('f', [lora_params.n_rank_ffn_norm, 1])
-        self.w1_a       = Tensor('f', [lora_params.n_rank_w1, params.n_embd])
-        self.w1_b       = Tensor('f', [lora_params.n_rank_w1, params.get_n_ff()])
-        self.w2_a       = Tensor('f', [lora_params.n_rank_w2, params.get_n_ff()])
-        self.w2_b       = Tensor('f', [lora_params.n_rank_w2, params.n_embd])
-        self.w3_a       = Tensor('f', [lora_params.n_rank_w3, params.n_embd])
-        self.w3_b       = Tensor('f', [lora_params.n_rank_w3, params.get_n_ff()])
-
-    def load(self, data, offset):
-        offset = self.att_norm_a.load(data, offset)
-        offset = self.att_norm_b.load(data, offset)
-        offset = self.wq_a.load(data, offset)
-        offset = self.wq_b.load(data, offset)
-        offset = self.wk_a.load(data, offset)
-        offset = self.wk_b.load(data, offset)
-        offset = self.wv_a.load(data, offset)
-        offset = self.wv_b.load(data, offset)
-        offset = self.wo_a.load(data, offset)
-        offset = self.wo_b.load(data, offset)
-        offset = self.ffn_norm_a.load(data, offset)
-        offset = self.ffn_norm_b.load(data, offset)
-        offset = self.w1_a.load(data, offset)
-        offset = self.w1_b.load(data, offset)
-        offset = self.w2_a.load(data, offset)
-        offset = self.w2_b.load(data, offset)
-        offset = self.w3_a.load(data, offset)
-        offset = self.w3_b.load(data, offset)
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        self.att_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_a"))
-        self.att_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_b"))
-        self.wq_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q,    self.bid, ".weight.lora_a"))
-        self.wq_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q,    self.bid, ".weight.lora_b"))
-        self.wk_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K,    self.bid, ".weight.lora_a"))
-        self.wk_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K,    self.bid, ".weight.lora_b"))
-        self.wv_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V,    self.bid, ".weight.lora_a"))
-        self.wv_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V,    self.bid, ".weight.lora_b"))
-        self.wo_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT,  self.bid, ".weight.lora_a"))
-        self.wo_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT,  self.bid, ".weight.lora_b"))
-        self.ffn_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM,  self.bid, ".weight.lora_a"))
-        self.ffn_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM,  self.bid, ".weight.lora_b"))
-        self.w1_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE,  self.bid, ".weight.lora_a"))
-        self.w1_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE,  self.bid, ".weight.lora_b"))
-        self.w2_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN,  self.bid, ".weight.lora_a"))
-        self.w2_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN,  self.bid, ".weight.lora_b"))
-        self.w3_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP,    self.bid, ".weight.lora_a"))
-        self.w3_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP,    self.bid, ".weight.lora_b"))
-
-class LoraModel:
-    def __init__(self, n_ff = None):
-        self.params = ModelParams(n_ff = n_ff)
-        self.lora_params = LoraParams()
-        self.layers = []
-
-    def load(self, data, offset):
-        offset = self.params.load(data, offset)
-        offset = self.lora_params.load(data, offset)
-
-        self.tok_embd_a = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_embd])
-        self.tok_embd_b = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_vocab])
-        self.norm_a     = Tensor('f', [self.lora_params.n_rank_norm, self.params.n_embd])
-        self.norm_b     = Tensor('f', [self.lora_params.n_rank_norm, 1])
-        self.output_a   = Tensor('f', [self.lora_params.n_rank_output, self.params.n_embd])
-        self.output_b   = Tensor('f', [self.lora_params.n_rank_output, self.params.n_vocab])
-
-        offset = self.tok_embd_a.load(data, offset)
-        offset = self.tok_embd_b.load(data, offset)
-        offset = self.norm_a.load(data, offset)
-        offset = self.norm_b.load(data, offset)
-        offset = self.output_a.load(data, offset)
-        offset = self.output_b.load(data, offset)
-
-        self.layers.clear()
-        for bid in range(self.params.n_layer):
-            layer = Layer(self.params, self.lora_params, bid)
-            offset = layer.load(data, offset)
-            self.layers.append(layer)
-
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        self.params.save_gguf(gguf_writer)
-        self.lora_params.save_gguf(gguf_writer)
-
-        self.tok_embd_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD,  suffix=".weight.lora_a"))
-        self.tok_embd_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD,  suffix=".weight.lora_b"))
-        self.norm_a.save_gguf    (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_a"))
-        self.norm_b.save_gguf    (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_b"))
-        self.output_a.save_gguf  (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT,      suffix=".weight.lora_a"))
-        self.output_b.save_gguf  (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT,      suffix=".weight.lora_b"))
-
-        for layer in self.layers:
-            layer.save_gguf(gguf_writer)
-
-class LoraCheckpoint:
-    def __init__(self, n_ff = None):
-        self.model = LoraModel(n_ff = n_ff)
-        self.opt_ctx = OptimizationContext()
-
-    def load(self, data, offset):
-        magic   = bytes(reversed(data[offset:offset + 4])); offset += 4
-        if magic != b'ggcl':
-            raise ValueError(f"File header magic indicates, that this is no finetune-lora checkpoint file. Expected 'ggcl', Got '{str(magic)}'")
-
-        self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        if self.version != 0:
-            raise ValueError('Invalid version of checkpoint file')
-
-        self.train_its     = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        self.train_tokens  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-
-        offset = self.model.load(data, offset)
-        offset = self.opt_ctx.load(data, offset)
-
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
-        gguf_writer.add_layer_norm_rms_eps(1e-5)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION,    0)
-        gguf_writer.add_string(LLM_KV_TRAINING_TYPE,            LLM_KV_TRAINING_TYPE_FINETUNE_LORA)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT,    self.train_samples)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT,     self.train_tokens)
-        self.model.save_gguf(gguf_writer)
-        self.opt_ctx.save_gguf(gguf_writer)
-
-def handle_args():
-    parser = argparse.ArgumentParser(description = 'Convert finetune checkpoints to GGUF')
-    parser.add_argument('--input',  '-i', type = Path, help = 'Input finetune checkpoint filename', required=True)
-    parser.add_argument('--output', '-o', type = Path, help = 'Output GGUF filename', required=True)
-    parser.add_argument('--ff', type = int, help = "Feedforward size, if not provided compute from n_mult. Provide this if you get 'ValueError: Tensor.load: Expected number of elements does not match what is read from file'", required=False)
-    return parser.parse_args()
-
-def main():
-    cfg = handle_args()
-    print(cfg)
-    data = np.memmap(cfg.input, mode = 'r')
-    chk = LoraCheckpoint(n_ff = cfg.ff)
-    offset = 0
-    offset = chk.load(data, offset)
-    # we should have read all available data
-    assert(offset == len(data))
-
-    gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
-    chk.save_gguf(gguf_writer)
-    print("    gguf: write header")
-    gguf_writer.write_header_to_file()
-    print("    gguf: write metadata")
-    gguf_writer.write_kv_data_to_file()
-    print("    gguf: write tensors")
-    gguf_writer.write_tensors_to_file()
-    gguf_writer.close()
-
-if __name__ == '__main__':
-    main()
diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
deleted file mode 100644
index 71a4333ee..000000000
--- a/examples/finetune/finetune.cpp
+++ /dev/null
@@ -1,1862 +0,0 @@
-#include "ggml.h"
-#include "ggml-alloc.h"
-#include "ggml-backend.h"
-#include "llama.h"
-#include "common.h"
-#include "train.h"
-#include <vector>
-#include <cstring>
-#include <ctime>
-#include <algorithm>
-#include <string>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-struct my_llama_hparams {
-    uint32_t n_vocab    = 32000;
-    uint32_t n_ctx      = 512;
-    uint32_t n_embd     = 4096;
-    uint32_t n_ff       = 11008;
-    uint32_t n_head     = 32;
-    uint32_t n_head_kv  = 32;
-    uint32_t n_layer    = 32;
-
-    // float f_norm_eps     = 1e-5f; // falcon
-    float f_norm_rms_eps = 1e-5f; // llama
-
-    float rope_freq_base  = 10000.0f;
-    float rope_freq_scale = 1.0f;
-
-    uint32_t n_gqa() const {
-        return n_head/n_head_kv;
-    }
-
-    uint32_t n_embd_head() const {
-        return n_embd/n_head;
-    }
-
-    uint32_t n_embd_gqa() const {
-        return n_embd/n_gqa();
-    }
-
-    bool operator!=(const my_llama_hparams& other) const {
-        return memcmp(this, &other, sizeof(other));
-    }
-};
-
-struct my_llama_layer {
-    // normalization
-    struct ggml_tensor * attention_norm;
-
-    // attention
-    struct ggml_tensor * wq;
-    struct ggml_tensor * wk;
-    struct ggml_tensor * wv;
-    struct ggml_tensor * wo;
-
-    // normalization
-    struct ggml_tensor * ffn_norm;
-
-    // ff
-    struct ggml_tensor * ffn_gate; // w1
-    struct ggml_tensor * ffn_down; // w2
-    struct ggml_tensor * ffn_up;   // w3
-};
-
-struct my_llama_model {
-    struct my_llama_hparams hparams;
-
-    struct ggml_tensor * tok_embeddings;
-
-    struct ggml_tensor * norm;
-    struct ggml_tensor * output;
-
-    std::vector<my_llama_layer> layers;
-};
-
-struct my_llama_lora_hparams {
-    uint32_t lora_r = 1;
-    uint32_t lora_alpha = 1;
-    uint32_t n_rank_attention_norm = 1;
-    uint32_t n_rank_wq = 4;
-    uint32_t n_rank_wk = 4;
-    uint32_t n_rank_wv = 4;
-    uint32_t n_rank_wo = 4;
-    uint32_t n_rank_ffn_norm = 1;
-    uint32_t n_rank_ffn_gate = 4;
-    uint32_t n_rank_ffn_down = 4;
-    uint32_t n_rank_ffn_up = 4;
-    uint32_t n_rank_tok_embeddings = 4;
-    uint32_t n_rank_norm = 1;
-    uint32_t n_rank_output = 4;
-
-    bool operator!=(const my_llama_lora_hparams& other) const {
-        return memcmp(this, &other, sizeof(other));
-    }
-};
-
-struct my_llama_lora_layer {
-    // normalization
-    struct ggml_tensor * attention_norm_a;
-    struct ggml_tensor * attention_norm_b;
-
-    // attention
-    struct ggml_tensor * wq_a;
-    struct ggml_tensor * wq_b;
-    struct ggml_tensor * wk_a;
-    struct ggml_tensor * wk_b;
-    struct ggml_tensor * wv_a;
-    struct ggml_tensor * wv_b;
-    struct ggml_tensor * wo_a;
-    struct ggml_tensor * wo_b;
-
-    // normalization
-    struct ggml_tensor * ffn_norm_a;
-    struct ggml_tensor * ffn_norm_b;
-
-    // ff
-    struct ggml_tensor * ffn_gate_a;
-    struct ggml_tensor * ffn_gate_b;
-    struct ggml_tensor * ffn_down_a;
-    struct ggml_tensor * ffn_down_b;
-    struct ggml_tensor * ffn_up_a;
-    struct ggml_tensor * ffn_up_b;
-};
-
-struct my_llama_lora {
-    struct ggml_context * ctx = NULL;
-    ggml_backend_buffer_t data;
-
-    my_llama_lora_hparams hparams;
-
-    struct ggml_tensor * tok_embeddings_a;
-    struct ggml_tensor * tok_embeddings_b;
-
-    struct ggml_tensor * norm_a;
-    struct ggml_tensor * norm_b;
-    struct ggml_tensor * output_a;
-    struct ggml_tensor * output_b;
-
-    std::vector<my_llama_lora_layer> layers;
-};
-
-// gguf constants
-static const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA   = "finetune_lora";
-static const char * LLM_KV_TRAINING_TYPE                 = "training.type";
-
-static const char * LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD  = "training.lora.rank.token_embd";
-static const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm";
-static const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT      = "training.lora.rank.output";
-static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_NORM   = "training.lora.rank.attn_norm";
-static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_Q      = "training.lora.rank.attn_q";
-static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_K      = "training.lora.rank.attn_k";
-static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_V      = "training.lora.rank.attn_v";
-static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_OUT    = "training.lora.rank.attn_output";
-static const char * LLM_KV_TRAINING_LORA_RANK_FFN_NORM    = "training.lora.rank.ffn_norm";
-static const char * LLM_KV_TRAINING_LORA_RANK_FFN_GATE    = "training.lora.rank.ffn_gate";
-static const char * LLM_KV_TRAINING_LORA_RANK_FFN_DOWN    = "training.lora.rank.ffn_down";
-static const char * LLM_KV_TRAINING_LORA_RANK_FFN_UP      = "training.lora.rank.ffn_up";
-
-// gguf constants (sync with gguf.py)
-
-static const char * LLM_KV_GENERAL_ARCHITECTURE        = "general.architecture";
-static const char * LLM_KV_GENERAL_FILE_TYPE           = "general.file_type";
-
-static const char * LLM_KV_CONTEXT_LENGTH              = "%s.context_length";
-static const char * LLM_KV_EMBEDDING_LENGTH            = "%s.embedding_length";
-static const char * LLM_KV_BLOCK_COUNT                 = "%s.block_count";
-static const char * LLM_KV_FEED_FORWARD_LENGTH         = "%s.feed_forward_length";
-static const char * LLM_KV_ATTENTION_HEAD_COUNT        = "%s.attention.head_count";
-static const char * LLM_KV_ATTENTION_HEAD_COUNT_KV     = "%s.attention.head_count_kv";
-static const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon";
-static const char * LLM_KV_ROPE_DIMENSION_COUNT        = "%s.rope.dimension_count";
-static const char * LLM_KV_ROPE_FREQ_BASE              = "%s.rope.freq_base"; // TODO load in llama.cpp
-static const char * LLM_KV_ROPE_SCALE_LINEAR           = "%s.rope.scale_linear";
-
-static const char * LLM_TENSOR_TOKEN_EMBD    = "token_embd";
-static const char * LLM_TENSOR_OUTPUT_NORM   = "output_norm";
-static const char * LLM_TENSOR_OUTPUT        = "output";
-static const char * LLM_TENSOR_ATTN_NORM     = "blk.%d.attn_norm";
-static const char * LLM_TENSOR_ATTN_Q        = "blk.%d.attn_q";
-static const char * LLM_TENSOR_ATTN_K        = "blk.%d.attn_k";
-static const char * LLM_TENSOR_ATTN_V        = "blk.%d.attn_v";
-static const char * LLM_TENSOR_ATTN_OUT      = "blk.%d.attn_output";
-static const char * LLM_TENSOR_FFN_NORM      = "blk.%d.ffn_norm";
-static const char * LLM_TENSOR_FFN_GATE      = "blk.%d.ffn_gate";
-static const char * LLM_TENSOR_FFN_DOWN      = "blk.%d.ffn_down";
-static const char * LLM_TENSOR_FFN_UP        = "blk.%d.ffn_up";
-
-static void print_params(struct my_llama_hparams * params) {
-    printf("%s: n_vocab               : %u\n", __func__, params->n_vocab);
-    printf("%s: n_ctx                 : %u\n", __func__, params->n_ctx);
-    printf("%s: n_embd                : %u\n", __func__, params->n_embd);
-    printf("%s: n_ff                  : %u\n", __func__, params->n_ff);
-    printf("%s: n_head                : %u\n", __func__, params->n_head);
-    printf("%s: n_head_kv             : %u\n", __func__, params->n_head_kv);
-    printf("%s: n_layer               : %u\n", __func__, params->n_layer);
-    printf("%s: norm_rms_eps          : %f\n", __func__, params->f_norm_rms_eps);
-    printf("%s: rope_freq_base        : %f\n", __func__, params->rope_freq_base);
-    printf("%s: rope_freq_scale       : %f\n", __func__, params->rope_freq_scale);
-}
-
-static void print_lora_params(struct my_llama_lora_hparams * params) {
-    printf("%s: n_rank_attention_norm : %u\n", __func__, params->n_rank_attention_norm);
-    printf("%s: n_rank_wq             : %u\n", __func__, params->n_rank_wq);
-    printf("%s: n_rank_wk             : %u\n", __func__, params->n_rank_wk);
-    printf("%s: n_rank_wv             : %u\n", __func__, params->n_rank_wv);
-    printf("%s: n_rank_wo             : %u\n", __func__, params->n_rank_wo);
-    printf("%s: n_rank_ffn_norm       : %u\n", __func__, params->n_rank_ffn_norm);
-    printf("%s: n_rank_ffn_gate       : %u\n", __func__, params->n_rank_ffn_gate);
-    printf("%s: n_rank_ffn_down       : %u\n", __func__, params->n_rank_ffn_down);
-    printf("%s: n_rank_ffn_up         : %u\n", __func__, params->n_rank_ffn_up);
-    printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings);
-    printf("%s: n_rank_norm           : %u\n", __func__, params->n_rank_norm);
-    printf("%s: n_rank_output         : %u\n", __func__, params->n_rank_output);
-}
-
-#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
-{ \
-    const std::string skey(key); \
-    const int kid = gguf_find_key(ctx, skey.c_str()); \
-    if (kid >= 0) { \
-        enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
-        if (ktype != (type)) { \
-            die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \
-        } \
-        (dst) = func(ctx, kid); \
-    } else if (req) { \
-        die_fmt("key not found in model: %s", skey.c_str()); \
-    } \
-}
-
-static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_hparams * hparams, const char * expected_arch) {
-    std::string arch;
-
-    GGUF_GET_KEY(ctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE);
-    if (expected_arch != NULL) {
-        if (arch != expected_arch) {
-            printf("%s: arch=%s expected_arch=%s\n", __func__, arch.c_str(), expected_arch);
-        }
-        GGML_ASSERT(arch == expected_arch);
-    }
-
-    std::vector<char> keybuf;
-    keybuf.resize(512);
-    auto kv = [&arch, &keybuf](const char * key) -> const char * {
-        snprintf(keybuf.data(), keybuf.size(), key, arch.c_str());
-        return keybuf.data();
-    };
-
-    GGUF_GET_KEY(ctx, hparams->n_embd,         gguf_get_val_u32, GGUF_TYPE_UINT32,  true, kv(LLM_KV_EMBEDDING_LENGTH));
-    GGUF_GET_KEY(ctx, hparams->n_ctx,          gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH));
-    GGUF_GET_KEY(ctx, hparams->n_ff,           gguf_get_val_u32, GGUF_TYPE_UINT32,  true, kv(LLM_KV_FEED_FORWARD_LENGTH));
-    GGUF_GET_KEY(ctx, hparams->n_head,         gguf_get_val_u32, GGUF_TYPE_UINT32,  true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
-    GGUF_GET_KEY(ctx, hparams->n_layer,        gguf_get_val_u32, GGUF_TYPE_UINT32,  true, kv(LLM_KV_BLOCK_COUNT));
-
-    // n_head_kv is optional, default to n_head
-    hparams->n_head_kv = hparams->n_head;
-    GGUF_GET_KEY(ctx, hparams->n_head_kv,      gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
-
-    float rope_freq_scale = 1.0f;
-    GGUF_GET_KEY(ctx, hparams->f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
-    GGUF_GET_KEY(ctx, hparams->rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
-    GGUF_GET_KEY(ctx, rope_freq_scale,         gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
-    if (rope_freq_scale != 1.0f) {
-        hparams->rope_freq_scale = 1.0f / rope_freq_scale;
-    }
-}
-
-static void init_model(struct llama_model * input, struct my_llama_model * model, const char * fn_model, uint32_t n_ctx) {
-    auto & hparams = model->hparams;
-
-    std::vector<char> tn_buf;
-    tn_buf.resize(GGML_MAX_NAME);
-    auto tn = [&tn_buf](const char * key) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key);
-        return tn_buf.data();
-    };
-    auto tni = [&tn_buf](const char * key, int bid) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), key, bid);
-        std::string s = tn_buf.data();
-        snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str());
-        return tn_buf.data();
-    };
-
-
-    // get parameters directly from gguf file
-    {
-        struct gguf_init_params params = {
-            /*.no_alloc = */ false,
-            /*.ctx      = */ NULL,
-        };
-        struct gguf_context * mctx = gguf_init_from_file(fn_model, params);
-
-        load_model_hparams_gguf(mctx, &hparams, "llama");
-
-        gguf_free(mctx);
-    }
-    hparams.n_vocab = llama_n_vocab(input);
-    hparams.n_ctx = n_ctx;
-
-    // get tensors from llama_model (possibly mmapped)
-    model->tok_embeddings = llama_get_model_tensor(input, tn(LLM_TENSOR_TOKEN_EMBD));
-    model->norm           = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT_NORM));
-    model->output         = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT));
-
-    assert_shape_2d(model->tok_embeddings, hparams.n_embd, hparams.n_vocab);
-    assert_shape_1d(model->norm,           hparams.n_embd);
-    assert_shape_2d(model->output,         hparams.n_embd, hparams.n_vocab);
-
-    model->layers.resize(hparams.n_layer);
-    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        layer.attention_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_NORM, i));
-        layer.wq             = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_Q, i));
-        layer.wk             = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_K, i));
-        layer.wv             = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_V, i));
-        layer.wo             = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_OUT, i));
-        layer.ffn_norm       = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_NORM, i));
-        layer.ffn_gate       = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i));
-        layer.ffn_down       = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i));
-        layer.ffn_up         = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i));
-
-        assert_shape_1d(layer.attention_norm, hparams.n_embd);
-        assert_shape_2d(layer.wq,             hparams.n_embd, hparams.n_embd);
-        assert_shape_2d(layer.wk,             hparams.n_embd, hparams.n_embd_gqa());
-        assert_shape_2d(layer.wv,             hparams.n_embd, hparams.n_embd_gqa());
-        assert_shape_2d(layer.wo,             hparams.n_embd, hparams.n_embd);
-        assert_shape_1d(layer.ffn_norm,       hparams.n_embd);
-        assert_shape_2d(layer.ffn_gate,       hparams.n_embd, hparams.n_ff);
-        assert_shape_2d(layer.ffn_down,       hparams.n_ff,   hparams.n_embd);
-        assert_shape_2d(layer.ffn_up,         hparams.n_embd, hparams.n_ff);
-    }
-}
-
-static void set_param_lora(struct my_llama_lora * lora) {
-    const uint32_t n_layer = lora->layers.size();
-
-    struct ggml_context* ctx = lora->ctx;
-
-    ggml_set_param(ctx, lora->tok_embeddings_a);
-    ggml_set_param(ctx, lora->tok_embeddings_b);
-    ggml_set_param(ctx, lora->norm_a);
-    ggml_set_param(ctx, lora->norm_b);
-    ggml_set_param(ctx, lora->output_a);
-    ggml_set_param(ctx, lora->output_b);
-
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = lora->layers[i];
-
-        ggml_set_param(ctx, layer.attention_norm_a);
-        ggml_set_param(ctx, layer.attention_norm_b);
-        ggml_set_param(ctx, layer.wq_a);
-        ggml_set_param(ctx, layer.wq_b);
-        ggml_set_param(ctx, layer.wk_a);
-        ggml_set_param(ctx, layer.wk_b);
-        ggml_set_param(ctx, layer.wv_a);
-        ggml_set_param(ctx, layer.wv_b);
-        ggml_set_param(ctx, layer.wo_a);
-        ggml_set_param(ctx, layer.wo_b);
-        ggml_set_param(ctx, layer.ffn_norm_a);
-        ggml_set_param(ctx, layer.ffn_norm_b);
-        ggml_set_param(ctx, layer.ffn_gate_a);
-        ggml_set_param(ctx, layer.ffn_gate_b);
-        ggml_set_param(ctx, layer.ffn_down_a);
-        ggml_set_param(ctx, layer.ffn_down_b);
-        ggml_set_param(ctx, layer.ffn_up_a);
-        ggml_set_param(ctx, layer.ffn_up_b);
-    }
-}
-
-static void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) {
-    const auto & lparams = lora->hparams;
-
-    const uint32_t n_embd     = model->hparams.n_embd;
-    const uint32_t n_embd_gqa = model->hparams.n_embd_gqa();
-    const uint32_t n_layer    = model->hparams.n_layer;
-    const uint32_t n_vocab    = model->hparams.n_vocab;
-    const uint32_t n_ff       = model->hparams.n_ff;
-
-    std::vector<char> tn_buf;
-    tn_buf.resize(GGML_MAX_NAME);
-    auto tn = [&tn_buf](const char * key, const char * suffix) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), "%s%s", key, suffix);
-        return tn_buf.data();
-    };
-    auto tni = [&tn_buf](const char * key, const char * suffix, int bid) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), key, bid);
-        std::string s = tn_buf.data();
-        snprintf(tn_buf.data(), tn_buf.size(), "%s%s", s.c_str(), suffix);
-        return tn_buf.data();
-    };
-
-    // context for lora tensors without their data
-    struct ggml_init_params ctx_lora_params;
-    ctx_lora_params.mem_size   = ggml_tensor_overhead()*2*(6 + n_layer*18);
-    ctx_lora_params.mem_buffer = NULL;
-    ctx_lora_params.no_alloc   = true;
-
-    struct ggml_context * ctx = ggml_init(ctx_lora_params);
-    lora->ctx = ctx;
-
-    lora->tok_embeddings_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_tok_embeddings, n_embd);
-    lora->tok_embeddings_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_tok_embeddings, n_vocab);
-    lora->norm_a           = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_norm, n_embd);
-    lora->norm_b           = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_norm, 1);
-    lora->output_a         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_output, n_embd);
-    lora->output_b         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_output, n_vocab);
-
-    ggml_set_name(lora->tok_embeddings_a, tn(LLM_TENSOR_TOKEN_EMBD,  ".weight.lora_a"));
-    ggml_set_name(lora->tok_embeddings_b, tn(LLM_TENSOR_TOKEN_EMBD,  ".weight.lora_b"));
-    ggml_set_name(lora->norm_a,           tn(LLM_TENSOR_OUTPUT_NORM, ".weight.lora_a"));
-    ggml_set_name(lora->norm_b,           tn(LLM_TENSOR_OUTPUT_NORM, ".weight.lora_b"));
-    ggml_set_name(lora->output_a,         tn(LLM_TENSOR_OUTPUT,      ".weight.lora_a"));
-    ggml_set_name(lora->output_b,         tn(LLM_TENSOR_OUTPUT,      ".weight.lora_b"));
-
-    lora->layers.resize(n_layer);
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = lora->layers[i];
-
-        layer.attention_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_attention_norm, n_embd);
-        layer.attention_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_attention_norm, 1);
-
-        layer.wq_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wq, n_embd);
-        layer.wq_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wq, n_embd);
-        layer.wk_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wk, n_embd);
-        layer.wk_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wk, n_embd_gqa);
-        layer.wv_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wv, n_embd);
-        layer.wv_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wv, n_embd_gqa);
-        layer.wo_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wo, n_embd);
-        layer.wo_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wo, n_embd);
-
-        layer.ffn_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, n_embd);
-        layer.ffn_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, 1);
-
-        layer.ffn_gate_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_embd);
-        layer.ffn_gate_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_ff);
-        layer.ffn_down_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_ff);
-        layer.ffn_down_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_embd);
-        layer.ffn_up_a   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up,   n_embd);
-        layer.ffn_up_b   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up,   n_ff);
-
-        ggml_set_name(layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_a", i));
-        ggml_set_name(layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_b", i));
-        ggml_set_name(layer.wq_a,             tni(LLM_TENSOR_ATTN_Q,    ".weight.lora_a", i));
-        ggml_set_name(layer.wq_b,             tni(LLM_TENSOR_ATTN_Q,    ".weight.lora_b", i));
-        ggml_set_name(layer.wk_a,             tni(LLM_TENSOR_ATTN_K,    ".weight.lora_a", i));
-        ggml_set_name(layer.wk_b,             tni(LLM_TENSOR_ATTN_K,    ".weight.lora_b", i));
-        ggml_set_name(layer.wv_a,             tni(LLM_TENSOR_ATTN_V,    ".weight.lora_a", i));
-        ggml_set_name(layer.wv_b,             tni(LLM_TENSOR_ATTN_V,    ".weight.lora_b", i));
-        ggml_set_name(layer.wo_a,             tni(LLM_TENSOR_ATTN_OUT,  ".weight.lora_a", i));
-        ggml_set_name(layer.wo_b,             tni(LLM_TENSOR_ATTN_OUT,  ".weight.lora_b", i));
-        ggml_set_name(layer.ffn_norm_a,       tni(LLM_TENSOR_FFN_NORM,  ".weight.lora_a", i));
-        ggml_set_name(layer.ffn_norm_b,       tni(LLM_TENSOR_FFN_NORM,  ".weight.lora_b", i));
-        ggml_set_name(layer.ffn_gate_a,       tni(LLM_TENSOR_FFN_GATE,  ".weight.lora_a", i));
-        ggml_set_name(layer.ffn_gate_b,       tni(LLM_TENSOR_FFN_GATE,  ".weight.lora_b", i));
-        ggml_set_name(layer.ffn_down_a,       tni(LLM_TENSOR_FFN_DOWN,  ".weight.lora_a", i));
-        ggml_set_name(layer.ffn_down_b,       tni(LLM_TENSOR_FFN_DOWN,  ".weight.lora_b", i));
-        ggml_set_name(layer.ffn_up_a,         tni(LLM_TENSOR_FFN_UP,    ".weight.lora_a", i));
-        ggml_set_name(layer.ffn_up_b,         tni(LLM_TENSOR_FFN_UP,    ".weight.lora_b", i));
-    }
-
-    set_param_lora(lora);
-
-    // allocate data for lora tensors
-    lora->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
-}
-
-static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) {
-    const uint32_t n_layer = lora->layers.size();
-
-    struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
-
-    randomize_tensor_normal(lora->tok_embeddings_a, rnd);
-    ggml_set_zero(lora->tok_embeddings_b);
-    randomize_tensor_normal(lora->norm_a,           rnd);
-    ggml_set_zero(lora->norm_b);
-    randomize_tensor_normal(lora->output_a,         rnd);
-    ggml_set_zero(lora->output_b);
-
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = lora->layers[i];
-        randomize_tensor_normal(layer.attention_norm_a, rnd);
-        ggml_set_zero(layer.attention_norm_b);
-
-        randomize_tensor_normal(layer.wq_a, rnd);
-        ggml_set_zero(layer.wq_b);
-        randomize_tensor_normal(layer.wk_a, rnd);
-        ggml_set_zero(layer.wk_b);
-        randomize_tensor_normal(layer.wv_a, rnd);
-        ggml_set_zero(layer.wv_b);
-        randomize_tensor_normal(layer.wo_a, rnd);
-        ggml_set_zero(layer.wo_b);
-
-        randomize_tensor_normal(layer.ffn_norm_a, rnd);
-        ggml_set_zero(layer.ffn_norm_b);
-
-        randomize_tensor_normal(layer.ffn_gate_a, rnd);
-        ggml_set_zero(layer.ffn_gate_b);
-        randomize_tensor_normal(layer.ffn_down_a, rnd);
-        ggml_set_zero(layer.ffn_down_b);
-        randomize_tensor_normal(layer.ffn_up_a, rnd);
-        ggml_set_zero(layer.ffn_up_b);
-    }
-
-    free_random_normal_distribution(rnd);
-}
-
-static struct ggml_tensor * llama_build_lora_finetune_graphs(
-        struct my_llama_model * model,
-        struct my_llama_lora  * lora,
-        ggml_gallocr_t          alloc,
-        struct ggml_context   * ctx,
-        struct ggml_cgraph    * gf,
-        struct ggml_cgraph    * gb,
-        struct ggml_cgraph    * gb_tmp,
-        struct ggml_tensor  * * logits,
-        struct ggml_tensor    * tokens_input,
-        struct ggml_tensor    * targets,
-        const  int              n_tokens,
-        const  int              n_batch,
-        const  bool             enable_flash_attn,
-        const  bool             enable_checkpointing,
-        const  bool             measure_only) {
-
-    ggml_set_scratch(ctx, { 0, 0, nullptr, });
-    const int n_past = 0;
-    const int N = n_tokens;
-    const auto & hparams  = model->hparams;
-    const int n_ctx       = hparams.n_ctx;
-    const int n_vocab     = hparams.n_vocab;
-    const int n_embd      = hparams.n_embd;
-    const int n_layer     = hparams.n_layer;
-    const int n_head      = hparams.n_head;
-    const int n_head_kv   = hparams.n_head_kv;
-    const int n_ff        = hparams.n_ff;
-    const int n_rot       = hparams.n_embd_head();
-    const int n_embd_head = hparams.n_embd_head();
-    const int n_embd_gqa  = hparams.n_embd_gqa();
-
-    const float rms_norm_eps    = hparams.f_norm_rms_eps;
-    const float rope_freq_base  = hparams.rope_freq_base;
-    const float rope_freq_scale = hparams.rope_freq_scale;
-
-    GGML_ASSERT((size_t) n_layer == lora->layers.size());
-
-    auto set_name = [](struct ggml_tensor * t, const char * n) {
-        ggml_set_name(t, n);
-        if (t->grad) {
-            ggml_format_name(t->grad, "%s->grad", n);
-        }
-    };
-
-    // KQ_pos - contains the positions
-    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
-    ggml_set_input(KQ_pos);
-
-    // rope has so much parameters that we make a custom function for it
-    auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
-                (struct ggml_tensor * t) -> struct ggml_tensor * {
-        // not capturing these, to silcence warnings
-        const int rope_mode = 0;
-
-        return ggml_rope_ext(ctx,
-            t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx,
-            rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
-        );
-    };
-
-    set_name(tokens_input, "tokens_input");
-    set_name(targets,      "targets");
-
-    GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
-
-    auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
-        if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16) {
-            return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
-        } else if (a->type == GGML_TYPE_F32) {
-            return ggml_add(ctx, a, b);
-        } else {
-            die_fmt("%s: Finetuning on tensors with type '%s' is not yet supported.\n",
-                __func__, ggml_type_name(a->type));
-        }
-    };
-
-    struct ggml_tensor * tok_embeddings = add_to_f32(ctx, model->tok_embeddings, ggml_mul_mat(ctx, lora->tok_embeddings_a, lora->tok_embeddings_b));
-    struct ggml_tensor * norm           = add_to_f32(ctx, model->norm, ggml_mul_mat(ctx, lora->norm_a, lora->norm_b));
-    struct ggml_tensor * output         = add_to_f32(ctx, model->output, ggml_mul_mat(ctx, lora->output_a, lora->output_b));
-
-    struct ggml_tensor * t00 = ggml_reshape_1d(ctx, tokens_input, N*n_batch);  set_name(t00, "t00"); assert_shape_1d(t00, N*n_batch);
-    struct ggml_tensor * t01 = ggml_get_rows(ctx, tok_embeddings, t00);        set_name(t01, "t01"); assert_shape_2d(t01, n_embd, N*n_batch);
-
-    struct ggml_tensor * cur = t01;
-
-    std::vector<struct ggml_tensor *> checkpoints;
-    if (enable_checkpointing) {
-        checkpoints.push_back(tokens_input);
-        checkpoints.push_back(targets);
-        checkpoints.push_back(t00);
-        checkpoints.push_back(t01);
-    }
-
-    const float kv_scale = 1.0f/sqrtf(float(n_embd)/n_head);
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct my_llama_layer & layer = model->layers[il];
-        struct my_llama_lora_layer & llayer = lora->layers[il];
-
-        struct ggml_tensor * attention_norm = add_to_f32(ctx, layer.attention_norm, ggml_mul_mat(ctx, llayer.attention_norm_a, llayer.attention_norm_b));
-        struct ggml_tensor * ffn_norm = add_to_f32(ctx, layer.ffn_norm, ggml_mul_mat(ctx, llayer.ffn_norm_a, llayer.ffn_norm_b));
-        struct ggml_tensor * wq       = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b));
-        struct ggml_tensor * wk       = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b));
-        struct ggml_tensor * wv       = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b));
-        struct ggml_tensor * wo       = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b));
-        struct ggml_tensor * ffn_gate = add_to_f32(ctx, layer.ffn_gate, ggml_mul_mat(ctx, llayer.ffn_gate_a, llayer.ffn_gate_b));
-        struct ggml_tensor * ffn_down = add_to_f32(ctx, layer.ffn_down, ggml_mul_mat(ctx, llayer.ffn_down_a, llayer.ffn_down_b));
-        struct ggml_tensor * ffn_up   = add_to_f32(ctx, layer.ffn_up, ggml_mul_mat(ctx, llayer.ffn_up_a, llayer.ffn_up_b));
-
-        struct ggml_tensor * t02 = ggml_rms_norm     (ctx, cur, rms_norm_eps);                       set_name(t02, "t02");     assert_shape_2d(t02, n_embd, N*n_batch);
-        struct ggml_tensor * t03 = ggml_repeat       (ctx, attention_norm, t02);                     set_name(t03, "t03");     assert_shape_2d(t03, n_embd, N*n_batch);
-        struct ggml_tensor * t04 = ggml_mul          (ctx, t03, t02);                                set_name(t04, "t04");     assert_shape_2d(t04, n_embd, N*n_batch);
-        struct ggml_tensor * t05 = ggml_mul_mat      (ctx, wq, t04);                                 set_name(t05, "t05");     assert_shape_2d(t05, n_embd, N*n_batch);
-        struct ggml_tensor * t06 = ggml_reshape_4d   (ctx, t05, n_embd_head, n_head, N, n_batch);    set_name(t06, "t06");     assert_shape_4d(t06, n_embd_head, n_head, N, n_batch);
-        struct ggml_tensor * t07 = rope              (t06);                                          set_name(t07, "t07");     assert_shape_4d(t07, n_embd_head, n_head, N, n_batch);
-        struct ggml_tensor * t08 = ggml_mul_mat      (ctx, wk, t04);                                 set_name(t08, "t08");     assert_shape_2d(t08, n_embd_gqa, N*n_batch);
-        struct ggml_tensor * t09 = ggml_reshape_4d   (ctx, t08, n_embd_head, n_head_kv, N, n_batch); set_name(t09, "t09");     assert_shape_4d(t09, n_embd_head, n_head_kv, N, n_batch);
-        struct ggml_tensor * t10 = rope              (t09);                                          set_name(t10, "t10");     assert_shape_4d(t10, n_embd_head, n_head_kv, N, n_batch);
-
-        struct ggml_tensor * t11;
-        if (ggml_is_quantized(wv->type)) {
-            struct ggml_tensor * t11_1 = ggml_mul_mat  (ctx, wv, t04);                               set_name(t11_1, "t11_1"); assert_shape_2d(t11_1, n_embd_gqa, N*n_batch);
-            struct ggml_tensor * t11_2 = ggml_transpose(ctx, t11_1);                                 set_name(t11_2, "t11_2"); assert_shape_2d(t11_2, N*n_batch, n_embd_gqa);
-                                 t11   = ggml_cont     (ctx, t11_2);                                 set_name(t11, "t11");     assert_shape_2d(t11, N*n_batch, n_embd_gqa);
-        } else {
-                                 t11   = ggml_mul_mat  (ctx, t04, wv);                               set_name(t11, "t11");     assert_shape_2d(t11, N*n_batch, n_embd_gqa);
-        }
-
-        struct ggml_tensor * t12 = ggml_reshape_4d   (ctx, t11, N, n_batch, n_embd_head, n_head_kv); set_name(t12, "t12");     assert_shape_4d(t12, N, n_batch, n_embd_head, n_head_kv);
-        struct ggml_tensor * t13 = ggml_permute      (ctx, t07, 0, 2, 1, 3);                         set_name(t13, "t13");     assert_shape_4d(t13, n_embd_head, N, n_head, n_batch);
-        struct ggml_tensor * t14 = ggml_permute      (ctx, t10, 0, 2, 1, 3);                         set_name(t14, "t14");     assert_shape_4d(t14, n_embd_head, N, n_head_kv, n_batch);
-        struct ggml_tensor * t15 = ggml_permute      (ctx, t12, 0, 3, 1, 2);                         set_name(t15, "t15");     assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch);
-        struct ggml_tensor * t16;
-        if (enable_flash_attn) {
-            GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
-            //t16 = ggml_flash_attn(ctx, t13, t14, t15, true);                                         set_name(t16, "t16");     assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
-        } else {
-            struct ggml_tensor * t16_0 = ggml_mul_mat              (ctx, t14, t13);                  set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
-            struct ggml_tensor * t16_1 = ggml_scale_inplace        (ctx, t16_0, kv_scale);           set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
-            struct ggml_tensor * t16_2 = ggml_diag_mask_inf_inplace(ctx, t16_1, n_past);             set_name(t16_2, "t16_2"); assert_shape_4d(t16_2, N, N, n_head, n_batch);
-            struct ggml_tensor * t16_3 = ggml_soft_max_inplace     (ctx, t16_2);                     set_name(t16_3, "t16_3"); assert_shape_4d(t16_3, N, N, n_head, n_batch);
-            t16 = ggml_mul_mat(ctx, t15, t16_3);                                                     set_name(t16, "t16");     assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
-        }
-        struct ggml_tensor * t17 = ggml_permute      (ctx, t16, 0, 2, 1, 3);                         set_name(t17, "t17");     assert_shape_4d(t17, n_embd_head, n_head, N, n_batch);
-        struct ggml_tensor * t18 = ggml_cont         (ctx, t17);                                     set_name(t18, "t18");     assert_shape_4d(t18, n_embd_head, n_head, N, n_batch);
-        struct ggml_tensor * t19 = ggml_reshape_2d   (ctx, t18, n_embd, N*n_batch);                  set_name(t19, "t19");     assert_shape_2d(t19, n_embd, N*n_batch);
-        struct ggml_tensor * t20 = ggml_mul_mat      (ctx, wo, t19);                                 set_name(t20, "t20");     assert_shape_2d(t20, n_embd, N*n_batch);
-        struct ggml_tensor * t21 = ggml_add          (ctx, t20, cur);                                set_name(t21, "t21");     assert_shape_2d(t21, n_embd, N*n_batch);
-        struct ggml_tensor * t22 = ggml_rms_norm     (ctx, t21, rms_norm_eps);                       set_name(t22, "t22");     assert_shape_2d(t22, n_embd, N*n_batch);
-        struct ggml_tensor * t23 = ggml_repeat       (ctx, ffn_norm, t22);                           set_name(t23, "t23");     assert_shape_2d(t23, n_embd, N*n_batch);
-        struct ggml_tensor * t24 = ggml_mul          (ctx, t23, t22);                                set_name(t24, "t24");     assert_shape_2d(t24, n_embd, N*n_batch);
-        struct ggml_tensor * t25 = ggml_mul_mat      (ctx, ffn_up, t24);                             set_name(t25, "t25");     assert_shape_2d(t25, n_ff, N*n_batch);
-        struct ggml_tensor * t26 = ggml_mul_mat      (ctx, ffn_gate, t24);                           set_name(t26, "t26");     assert_shape_2d(t26, n_ff, N*n_batch);
-        struct ggml_tensor * t27 = ggml_silu         (ctx, t26);                                     set_name(t27, "t27");     assert_shape_2d(t27, n_ff, N*n_batch);
-        struct ggml_tensor * t28 = ggml_mul          (ctx, t27, t25);                                set_name(t28, "t28");     assert_shape_2d(t28, n_ff, N*n_batch);
-        struct ggml_tensor * t29 = ggml_mul_mat      (ctx, ffn_down, t28);                           set_name(t29, "t29");     assert_shape_2d(t29, n_embd, N*n_batch);
-        struct ggml_tensor * t30 = ggml_add          (ctx, t29, t21);                                set_name(t30, "t30");     assert_shape_2d(t30, n_embd, N*n_batch);
-        cur = t30;
-        if (enable_checkpointing) {
-            checkpoints.push_back(cur);
-        }
-    }
-    struct ggml_tensor * t31   = ggml_rms_norm          (ctx, cur, rms_norm_eps);                    set_name(t31, "t31");     assert_shape_2d(t31, n_embd, N*n_batch);
-    struct ggml_tensor * t32   = ggml_repeat            (ctx, norm, t31);                            set_name(t32, "t32");     assert_shape_2d(t32, n_embd, N*n_batch);
-    struct ggml_tensor * t33   = ggml_mul               (ctx, t32, t31);                             set_name(t33, "t33");     assert_shape_2d(t33, n_embd, N*n_batch);
-    struct ggml_tensor * t34   = ggml_mul_mat           (ctx, output, t33);                          set_name(t34, "t34");     assert_shape_2d(t34, n_vocab, N*n_batch);
-    struct ggml_tensor * t35   = ggml_reshape_3d        (ctx, t34, n_vocab, N, n_batch);             set_name(t35, "t35");     assert_shape_3d(t35, n_vocab, N, n_batch);
-    struct ggml_tensor * t36   = ggml_cross_entropy_loss(ctx, t35, targets);                         set_name(t36, "t36");     assert_shape_1d(t36, 1);
-
-    if (enable_checkpointing) {
-        checkpoints.push_back(t31);
-        checkpoints.push_back(t32);
-        checkpoints.push_back(t33);
-        checkpoints.push_back(t34);
-        checkpoints.push_back(t35);
-        checkpoints.push_back(t36);
-    }
-
-    ggml_build_forward_expand(gf, t36);
-
-    if (enable_checkpointing) {
-        ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size());
-    } else {
-        ggml_graph_cpy(gf, gb);
-        ggml_build_backward_expand(ctx, gf, gb, true);
-    }
-
-    GGML_ASSERT(alloc != NULL);
-
-    // make sure some tensors are not reallocated by inserting new temporary nodes depending on them
-    int n_leafs_before = gb->n_leafs;
-    int n_nodes_before = gb->n_nodes;
-
-    // output tensors
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, 1.0f));
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, 1.0f));
-    // input gradient
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f));
-    GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
-    ggml_set_input(t36->grad);
-    // KQ_pos
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));
-
-    // make sure base model tensors data cannot be used in viewable operations
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, 1.0f));
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, 1.0f));
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, 1.0f));
-    for (int il = 0; il < n_layer; ++il) {
-        struct my_llama_layer & layer = model->layers[il];
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_gate, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_down, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_up, 1.0f));
-    }
-
-    // allocating checkpoints in one block to reduce memory fragmentation
-    // note: they will be freed in reverse order
-    for (unsigned int i = 0; i < checkpoints.size(); ++i) {
-        if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) {
-            ggml_set_input(checkpoints[i]);
-        }
-    }
-
-    if (measure_only) {
-        ggml_gallocr_reserve(alloc, gb);
-    } else {
-        ggml_gallocr_alloc_graph(alloc, gb);
-
-        // set KQ_pos
-        {
-            int * data = (int *) KQ_pos->data;
-            for (int i = 0; i < N; ++i) {
-                data[i] = n_past + i;
-            }
-        }
-    }
-
-    // remove the additional nodes and leafs
-    for (int i = n_leafs_before; i < gb->n_leafs; ++i) {
-        gb->leafs[i] = NULL;
-    }
-    for (int i = n_nodes_before; i < gb->n_nodes; ++i) {
-        gb->nodes[i] = NULL;
-    }
-    gb->n_leafs = n_leafs_before;
-    gb->n_nodes = n_nodes_before;
-
-    *logits = t35;
-    return t36;
-}
-
-static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora) {
-    // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read
-
-    std::string arch;
-
-    std::vector<char> keybuf;
-    keybuf.resize(512);
-
-    GGUF_GET_KEY(fctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE);
-    GGML_ASSERT(arch == "llama");
-
-    uint32_t ftype_u;
-    GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_GENERAL_FILE_TYPE);
-    GGML_ASSERT((enum llama_ftype) ftype_u == LLAMA_FTYPE_ALL_F32);
-
-    struct my_llama_hparams hparams;
-    load_model_hparams_gguf(fctx, &hparams, arch.c_str());
-
-    // parameters that define tensor shapes must match
-    GGML_ASSERT(hparams.n_embd    == model->hparams.n_embd);
-    GGML_ASSERT(hparams.n_ff      == model->hparams.n_ff);
-    GGML_ASSERT(hparams.n_head    == model->hparams.n_head);
-    GGML_ASSERT(hparams.n_head_kv == model->hparams.n_head_kv);
-    GGML_ASSERT(hparams.n_layer   == model->hparams.n_layer);
-
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_tok_embeddings, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_norm,           gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_output,         gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_OUTPUT);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_attention_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_NORM);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_wq,             gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_Q);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_wk,             gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_K);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv,             gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo,             gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm,       gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_gate,       gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_down,       gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_up,         gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP);
-
-    init_lora(model, lora);
-
-    copy_tensor_by_name(lora->tok_embeddings_a, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_a));
-    copy_tensor_by_name(lora->tok_embeddings_b, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_b));
-    copy_tensor_by_name(lora->norm_a,           f_ggml_ctx, ggml_get_name(lora->norm_a));
-    copy_tensor_by_name(lora->norm_b,           f_ggml_ctx, ggml_get_name(lora->norm_b));
-    copy_tensor_by_name(lora->output_a,         f_ggml_ctx, ggml_get_name(lora->output_a));
-    copy_tensor_by_name(lora->output_b,         f_ggml_ctx, ggml_get_name(lora->output_b));
-
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
-        copy_tensor_by_name(layer.attention_norm_a, f_ggml_ctx, ggml_get_name(layer.attention_norm_a));
-        copy_tensor_by_name(layer.attention_norm_b, f_ggml_ctx, ggml_get_name(layer.attention_norm_b));
-        copy_tensor_by_name(layer.wq_a,             f_ggml_ctx, ggml_get_name(layer.wq_a));
-        copy_tensor_by_name(layer.wq_b,             f_ggml_ctx, ggml_get_name(layer.wq_b));
-        copy_tensor_by_name(layer.wk_a,             f_ggml_ctx, ggml_get_name(layer.wk_a));
-        copy_tensor_by_name(layer.wk_b,             f_ggml_ctx, ggml_get_name(layer.wk_b));
-        copy_tensor_by_name(layer.wv_a,             f_ggml_ctx, ggml_get_name(layer.wv_a));
-        copy_tensor_by_name(layer.wv_b,             f_ggml_ctx, ggml_get_name(layer.wv_b));
-        copy_tensor_by_name(layer.wo_a,             f_ggml_ctx, ggml_get_name(layer.wo_a));
-        copy_tensor_by_name(layer.wo_b,             f_ggml_ctx, ggml_get_name(layer.wo_b));
-        copy_tensor_by_name(layer.ffn_norm_a,       f_ggml_ctx, ggml_get_name(layer.ffn_norm_a));
-        copy_tensor_by_name(layer.ffn_norm_b,       f_ggml_ctx, ggml_get_name(layer.ffn_norm_b));
-        copy_tensor_by_name(layer.ffn_gate_a,       f_ggml_ctx, ggml_get_name(layer.ffn_gate_a));
-        copy_tensor_by_name(layer.ffn_gate_b,       f_ggml_ctx, ggml_get_name(layer.ffn_gate_b));
-        copy_tensor_by_name(layer.ffn_down_a,       f_ggml_ctx, ggml_get_name(layer.ffn_down_a));
-        copy_tensor_by_name(layer.ffn_down_b,       f_ggml_ctx, ggml_get_name(layer.ffn_down_b));
-        copy_tensor_by_name(layer.ffn_up_a,         f_ggml_ctx, ggml_get_name(layer.ffn_up_a));
-        copy_tensor_by_name(layer.ffn_up_b,         f_ggml_ctx, ggml_get_name(layer.ffn_up_b));
-    }
-}
-
-static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora) {
-    const char * arch = "llama";
-    enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
-
-    std::vector<char> keybuf;
-    keybuf.resize(512);
-    auto kv = [arch, &keybuf](const char * key) -> const char * {
-        snprintf(keybuf.data(), keybuf.size(), key, arch);
-        return keybuf.data();
-    };
-
-    gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch);
-    gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype);
-
-    gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH),              model->hparams.n_ctx);
-    gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH),            model->hparams.n_embd);
-    gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH),         model->hparams.n_ff);
-    gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT),        model->hparams.n_head);
-    gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV),     model->hparams.n_head_kv);
-    gguf_set_val_u32(fctx, kv(LLM_KV_BLOCK_COUNT),                 model->hparams.n_layer);
-    gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_DIMENSION_COUNT),        model->hparams.n_embd_head());
-    gguf_set_val_f32(fctx, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS), model->hparams.f_norm_rms_eps);
-    gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_FREQ_BASE),              model->hparams.rope_freq_base);
-    gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_SCALE_LINEAR),           model->hparams.rope_freq_scale);
-
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD,   lora->hparams.n_rank_tok_embeddings);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM,  lora->hparams.n_rank_norm);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_OUTPUT,       lora->hparams.n_rank_output);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_NORM,    lora->hparams.n_rank_attention_norm);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_Q,       lora->hparams.n_rank_wq);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_K,       lora->hparams.n_rank_wk);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_V,       lora->hparams.n_rank_wv);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT,     lora->hparams.n_rank_wo);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM,     lora->hparams.n_rank_ffn_norm);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE,     lora->hparams.n_rank_ffn_gate);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN,     lora->hparams.n_rank_ffn_down);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP,       lora->hparams.n_rank_ffn_up);
-
-    gguf_add_tensor(fctx, lora->tok_embeddings_a);
-    gguf_add_tensor(fctx, lora->tok_embeddings_b);
-    gguf_add_tensor(fctx, lora->norm_a);
-    gguf_add_tensor(fctx, lora->norm_b);
-    gguf_add_tensor(fctx, lora->output_a);
-    gguf_add_tensor(fctx, lora->output_b);
-
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
-
-        gguf_add_tensor(fctx, layer.attention_norm_a);
-        gguf_add_tensor(fctx, layer.attention_norm_b);
-        gguf_add_tensor(fctx, layer.wq_a);
-        gguf_add_tensor(fctx, layer.wq_b);
-        gguf_add_tensor(fctx, layer.wk_a);
-        gguf_add_tensor(fctx, layer.wk_b);
-        gguf_add_tensor(fctx, layer.wv_a);
-        gguf_add_tensor(fctx, layer.wv_b);
-        gguf_add_tensor(fctx, layer.wo_a);
-        gguf_add_tensor(fctx, layer.wo_b);
-        gguf_add_tensor(fctx, layer.ffn_norm_a);
-        gguf_add_tensor(fctx, layer.ffn_norm_b);
-        gguf_add_tensor(fctx, layer.ffn_gate_a);
-        gguf_add_tensor(fctx, layer.ffn_gate_b);
-        gguf_add_tensor(fctx, layer.ffn_down_a);
-        gguf_add_tensor(fctx, layer.ffn_down_b);
-        gguf_add_tensor(fctx, layer.ffn_up_a);
-        gguf_add_tensor(fctx, layer.ffn_up_b);
-    }
-}
-
-static void load_checkpoint_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) {
-    std::string train_type = LLM_KV_TRAINING_TYPE_FINETUNE_LORA;
-    GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE);
-    GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_FINETUNE_LORA);
-
-    load_train_state_gguf(fctx, f_ggml_ctx, train);
-    load_llama_lora_gguf(fctx, f_ggml_ctx, model, lora);
-}
-
-static void save_checkpoint_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) {
-    gguf_set_val_str(fctx, LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA);
-    save_llama_lora_gguf(fctx, model, lora);
-    save_train_state_gguf(fctx, train);
-}
-
-static bool load_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) {
-    struct ggml_context * f_ggml_ctx;
-    struct gguf_init_params params;
-    params.no_alloc = false;
-    params.ctx = &f_ggml_ctx;
-    struct gguf_context * fctx = gguf_init_from_file(filename, params);
-    if (fctx == NULL) {
-        return false;
-    }
-
-    load_checkpoint_lora_gguf(fctx, f_ggml_ctx, model, lora, train);
-
-    gguf_free(fctx);
-    return true;
-}
-
-static void save_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) {
-    printf("%s: saving to %s\n", __func__, filename);
-    struct gguf_context * fctx = gguf_init_empty();
-
-    save_checkpoint_lora_gguf(fctx, model, lora, train);
-
-    // write file
-    const bool only_meta = false;
-    gguf_write_to_file(fctx, filename, only_meta);
-    gguf_free(fctx);
-}
-
-struct llama_file {
-    // use FILE * so we don't have to re-open the file to mmap
-    FILE * fp;
-    size_t size;
-
-    llama_file(const char * fname, const char * mode) {
-        fp = std::fopen(fname, mode);
-        if (fp == NULL) {
-            size = 0;
-        } else {
-            seek(0, SEEK_END);
-            size = tell();
-            seek(0, SEEK_SET);
-        }
-    }
-
-    size_t tell() const {
-#ifdef _WIN32
-        __int64 ret = _ftelli64(fp);
-#else
-        long ret = std::ftell(fp);
-#endif
-        GGML_ASSERT(ret != -1); // this really shouldn't fail
-        return (size_t) ret;
-    }
-
-    void seek(size_t offset, int whence) {
-#ifdef _WIN32
-        int ret = _fseeki64(fp, (__int64) offset, whence);
-#else
-        int ret = std::fseek(fp, (long) offset, whence);
-#endif
-        GGML_ASSERT(ret == 0); // same
-    }
-
-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
-            return;
-        }
-        errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
-        if (ferror(fp)) {
-            die_fmt("read error: %s", strerror(errno));
-        }
-        if (ret != 1) {
-            die("unexpectedly reached end of file");
-        }
-    }
-
-    std::uint32_t read_u32() {
-        std::uint32_t ret;
-        read_raw(&ret, sizeof(ret));
-        return ret;
-    }
-
-    std::string read_string(std::uint32_t len) {
-        std::vector<char> chars(len);
-        read_raw(chars.data(), len);
-        return std::string(chars.data(), len);
-    }
-
-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
-            return;
-        }
-        errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
-        if (ret != 1) {
-            die_fmt("write error: %s", strerror(errno));
-        }
-    }
-
-    void write_u32(std::uint32_t val) {
-        write_raw(&val, sizeof(val));
-    }
-
-    ~llama_file() {
-        if (fp) {
-            std::fclose(fp);
-        }
-    }
-};
-
-static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor, const char * name) {
-    if (tensor == NULL) {
-        file->write_u32(0);
-        file->write_u32(0);
-        file->write_u32(GGML_TYPE_F32);
-        file->seek((0-file->tell()) & 31, SEEK_CUR);
-        return;
-    }
-    if (name == NULL) {
-        name = ggml_get_name(tensor);
-    }
-    uint32_t name_len = strlen(name);
-    uint32_t nd = ggml_n_dims(tensor);
-    uint32_t ne[4] = { (uint32_t)tensor->ne[0],
-                       (uint32_t)tensor->ne[1],
-                       (uint32_t)tensor->ne[2],
-                       (uint32_t)tensor->ne[3] };
-    file->write_u32(nd);
-    file->write_u32(name_len);
-    file->write_u32(tensor->type);
-    file->write_raw(ne, sizeof(ne[0]) * nd);
-    file->write_raw(name, name_len);
-    file->seek((0-file->tell()) & 31, SEEK_CUR);
-    file->write_raw(tensor->data, ggml_nbytes(tensor));
-}
-
-static void save_as_llama_lora(const char * filename, struct my_llama_lora * lora) {
-    printf("%s: saving to %s\n", __func__, filename);
-    struct llama_file file(filename, "wb");
-    if (file.fp == NULL) {
-        return;
-    }
-
-    std::vector<char> tn_buf;
-    tn_buf.resize(GGML_MAX_NAME);
-
-    auto tn = [&tn_buf](const char * key, const char * suffix) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), "%s%s", key, suffix);
-        return tn_buf.data();
-    };
-
-    auto tni = [&tn_buf](const char * key, int bid, const char * suffix) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), key, bid);
-        std::string s = tn_buf.data();
-        snprintf(tn_buf.data(), tn_buf.size(), "%s%s", s.c_str(), suffix);
-        return tn_buf.data();
-    };
-
-    // write_magic
-    file.write_u32(LLAMA_FILE_MAGIC_GGLA);   // magic
-    file.write_u32(1); // version
-    // write_hparams
-    file.write_u32(lora->hparams.lora_r);
-    file.write_u32(lora->hparams.lora_alpha);
-    // write tensors
-    write_tensor(&file, lora->tok_embeddings_a, tn(LLM_TENSOR_TOKEN_EMBD,  ".weight.loraA"));
-    write_tensor(&file, lora->tok_embeddings_b, tn(LLM_TENSOR_TOKEN_EMBD,  ".weight.loraB"));
-    write_tensor(&file, lora->norm_a,           tn(LLM_TENSOR_OUTPUT_NORM, ".weight.loraA"));
-    write_tensor(&file, lora->norm_b,           tn(LLM_TENSOR_OUTPUT_NORM, ".weight.loraB"));
-    write_tensor(&file, lora->output_a,         tn(LLM_TENSOR_OUTPUT,      ".weight.loraA"));
-    write_tensor(&file, lora->output_b,         tn(LLM_TENSOR_OUTPUT,      ".weight.loraB"));
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
-        write_tensor(&file, layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, i, ".weight.loraA"));
-        write_tensor(&file, layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, i, ".weight.loraB"));
-        write_tensor(&file, layer.wq_a,             tni(LLM_TENSOR_ATTN_Q,    i, ".weight.loraA"));
-        write_tensor(&file, layer.wq_b,             tni(LLM_TENSOR_ATTN_Q,    i, ".weight.loraB"));
-        write_tensor(&file, layer.wk_a,             tni(LLM_TENSOR_ATTN_K,    i, ".weight.loraA"));
-        write_tensor(&file, layer.wk_b,             tni(LLM_TENSOR_ATTN_K,    i, ".weight.loraB"));
-        write_tensor(&file, layer.wv_a,             tni(LLM_TENSOR_ATTN_V,    i, ".weight.loraA"));
-        write_tensor(&file, layer.wv_b,             tni(LLM_TENSOR_ATTN_V,    i, ".weight.loraB"));
-        write_tensor(&file, layer.wo_a,             tni(LLM_TENSOR_ATTN_OUT,  i, ".weight.loraA"));
-        write_tensor(&file, layer.wo_b,             tni(LLM_TENSOR_ATTN_OUT,  i, ".weight.loraB"));
-        write_tensor(&file, layer.ffn_norm_a,       tni(LLM_TENSOR_FFN_NORM,  i, ".weight.loraA"));
-        write_tensor(&file, layer.ffn_norm_b,       tni(LLM_TENSOR_FFN_NORM,  i, ".weight.loraB"));
-        write_tensor(&file, layer.ffn_gate_a,       tni(LLM_TENSOR_FFN_GATE,  i, ".weight.loraA"));
-        write_tensor(&file, layer.ffn_gate_b,       tni(LLM_TENSOR_FFN_GATE,  i, ".weight.loraB"));
-        write_tensor(&file, layer.ffn_down_a,       tni(LLM_TENSOR_FFN_DOWN,  i, ".weight.loraA"));
-        write_tensor(&file, layer.ffn_down_b,       tni(LLM_TENSOR_FFN_DOWN,  i, ".weight.loraB"));
-        write_tensor(&file, layer.ffn_up_a,         tni(LLM_TENSOR_FFN_UP,    i, ".weight.loraA"));
-        write_tensor(&file, layer.ffn_up_b,         tni(LLM_TENSOR_FFN_UP,    i, ".weight.loraB"));
-    }
-}
-
-struct train_params {
-    struct train_params_common common;
-
-    const char * fn_model_base;
-    const char * fn_lora_out;
-
-    bool only_write_lora;
-
-    float f_norm_rms_eps;
-    float rope_freq_base;
-    float rope_freq_scale;
-
-    bool custom_f_norm_rms_eps;
-    bool custom_rope_freq_base;
-    bool custom_rope_freq_scale;
-
-    int32_t lora_r;
-    int32_t lora_alpha;
-    bool custom_lora_alpha;
-
-    uint32_t n_rank_attention_norm;
-    uint32_t n_rank_wq;
-    uint32_t n_rank_wk;
-    uint32_t n_rank_wv;
-    uint32_t n_rank_wo;
-    uint32_t n_rank_ffn_norm;
-    uint32_t n_rank_ffn_gate;
-    uint32_t n_rank_ffn_down;
-    uint32_t n_rank_ffn_up;
-    uint32_t n_rank_tok_embeddings;
-    uint32_t n_rank_norm;
-    uint32_t n_rank_output;
-
-    bool custom_n_rank_attention_norm;
-    bool custom_n_rank_wq;
-    bool custom_n_rank_wk;
-    bool custom_n_rank_wv;
-    bool custom_n_rank_wo;
-    bool custom_n_rank_ffn_norm;
-    bool custom_n_rank_ffn_gate;
-    bool custom_n_rank_ffn_down;
-    bool custom_n_rank_ffn_up;
-    bool custom_n_rank_tok_embeddings;
-    bool custom_n_rank_norm;
-    bool custom_n_rank_output;
-};
-
-static struct train_params get_default_train_params() {
-    struct train_params params;
-    params.common = get_default_train_params_common();
-    params.fn_model_base     = "";
-    params.fn_lora_out       = "ggml-lora-ITERATION-f32.gguf";
-
-    params.only_write_lora = false;
-
-    params.f_norm_rms_eps  = 1e-5f;
-    params.rope_freq_base  = 10000.0f;
-    params.rope_freq_scale = 1.0f;
-
-    params.custom_f_norm_rms_eps  = false;
-    params.custom_rope_freq_base  = false;
-    params.custom_rope_freq_scale = false;
-
-    params.lora_r      = 4;
-    params.lora_alpha  = 4;
-    params.custom_lora_alpha = false;
-
-    params.n_rank_attention_norm = 1;
-    params.n_rank_wq             = 4;
-    params.n_rank_wk             = 4;
-    params.n_rank_wv             = 4;
-    params.n_rank_wo             = 4;
-    params.n_rank_ffn_norm       = 1;
-    params.n_rank_ffn_gate       = 4;
-    params.n_rank_ffn_down       = 4;
-    params.n_rank_ffn_up         = 4;
-    params.n_rank_tok_embeddings = 4;
-    params.n_rank_norm           = 1;
-    params.n_rank_output         = 4;
-
-    params.custom_n_rank_attention_norm = false;
-    params.custom_n_rank_wq             = false;
-    params.custom_n_rank_wk             = false;
-    params.custom_n_rank_wv             = false;
-    params.custom_n_rank_wo             = false;
-    params.custom_n_rank_ffn_norm       = false;
-    params.custom_n_rank_ffn_gate       = false;
-    params.custom_n_rank_ffn_down       = false;
-    params.custom_n_rank_ffn_up         = false;
-    params.custom_n_rank_tok_embeddings = false;
-    params.custom_n_rank_norm           = false;
-    params.custom_n_rank_output         = false;
-
-    return params;
-}
-
-static void train_print_usage(int argc, char ** argv, const struct train_params * params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help                 show this help message and exit\n");
-
-    fprintf(stderr, "  --model-base FNAME         model path from which to load base model (default '%s')\n", params->fn_model_base);
-    fprintf(stderr, "  --lora-out FNAME           path to save llama lora (default '%s')\n", params->fn_lora_out);
-    fprintf(stderr, "  --only-write-lora          only save llama lora, don't do any training.  use this if you only want to convert a checkpoint to a lora adapter.\n");
-    fprintf(stderr, "  --norm-rms-eps F           RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps);
-    fprintf(stderr, "  --rope-freq-base F         Frequency base for ROPE (default %f)\n", params->rope_freq_base);
-    fprintf(stderr, "  --rope-freq-scale F        Frequency scale for ROPE (default %f)\n", params->rope_freq_scale);
-    fprintf(stderr, "  --lora-alpha N             LORA alpha : resulting LORA scaling is alpha/r. (default %d)\n", params->lora_alpha);
-    fprintf(stderr, "  --lora-r N                 LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default %d)\n", params->lora_r);
-    fprintf(stderr, "  --rank-att-norm N          LORA rank for attention norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n");
-    fprintf(stderr, "  --rank-ffn-norm N          LORA rank for feed-forward norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n");
-    fprintf(stderr, "  --rank-out-norm N          LORA rank for output norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n");
-    fprintf(stderr, "  --rank-tok-embd N          LORA rank for token embeddings tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-out N               LORA rank for output tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-wq N                LORA rank for wq tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-wk N                LORA rank for wk tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-wv N                LORA rank for wv tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-wo N                LORA rank for wo tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-ffn_gate N          LORA rank for ffn_gate tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-ffn_down N          LORA rank for ffn_down tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-ffn_up N            LORA rank for ffn_up tensor, overrides default rank.\n");
-
-    print_common_train_usage(argc, argv, &params->common);
-}
-
-static bool train_params_parse(int argc, char ** argv, struct train_params * params) {
-    bool invalid_param = false;
-    std::string arg;
-    struct train_params default_params = get_default_train_params();
-    const std::string arg_prefix = "--";
-
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
-            std::replace(arg.begin(), arg.end(), '_', '-');
-        }
-
-        if (consume_common_train_arg(argc, argv, &i, &params->common, &invalid_param)) {
-            if (invalid_param) {
-                break;
-            } else if (params->common.print_usage) {
-                train_print_usage(argc, argv, &default_params);
-                exit(0);
-            }
-        } else if (arg == "--model-base") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->fn_model_base = argv[i];
-        } else if (arg == "--lora-out") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->fn_lora_out = argv[i];
-        } else if (arg == "--only-write-lora") {
-            params->only_write_lora = true;
-        } else if (arg == "--norm-rms-eps") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->f_norm_rms_eps = std::stof(argv[i]);
-            params->custom_f_norm_rms_eps = true;
-        } else if (arg == "--rope-freq-base") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->rope_freq_base = std::stof(argv[i]);
-            params->custom_rope_freq_base = true;
-        } else if (arg == "--rope-freq-scale") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->rope_freq_scale = std::stof(argv[i]);
-            params->custom_rope_freq_scale = true;
-        } else if (arg == "--lora-alpha") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->lora_alpha = std::stoi(argv[i]);
-            params->custom_lora_alpha = true;
-        } else if (arg == "--lora-r") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->lora_r = std::stoi(argv[i]);
-        } else if (arg == "--rank-att-norm") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_attention_norm = std::stoi(argv[i]);
-            params->custom_n_rank_attention_norm = true;
-        } else if (arg == "--rank-ffn-norm") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_ffn_norm = std::stoi(argv[i]);
-            params->custom_n_rank_ffn_norm = true;
-        } else if (arg == "--rank-out-norm") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_norm = std::stoi(argv[i]);
-            params->custom_n_rank_norm = true;
-        } else if (arg == "--rank-tok-embd") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_tok_embeddings = std::stoi(argv[i]);
-            params->custom_n_rank_tok_embeddings = true;
-        } else if (arg == "--rank-out") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_output = std::stoi(argv[i]);
-            params->custom_n_rank_output = true;
-        } else if (arg == "--rank-wq") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_wq = std::stoi(argv[i]);
-            params->custom_n_rank_wq = true;
-        } else if (arg == "--rank-wk") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_wk = std::stoi(argv[i]);
-            params->custom_n_rank_wk = true;
-        } else if (arg == "--rank-wv") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_wv = std::stoi(argv[i]);
-            params->custom_n_rank_wv = true;
-        } else if (arg == "--rank-wo") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_wo = std::stoi(argv[i]);
-            params->custom_n_rank_wo = true;
-        } else if (arg == "--rank-ffn_gate") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_ffn_gate = std::stoi(argv[i]);
-            params->custom_n_rank_ffn_gate = true;
-        } else if (arg == "--rank-ffn_down") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_ffn_down = std::stoi(argv[i]);
-            params->custom_n_rank_ffn_down = true;
-        } else if (arg == "--rank-ffn_up") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_ffn_up = std::stoi(argv[i]);
-            params->custom_n_rank_ffn_up = true;
-        } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            train_print_usage(argc, argv, &default_params);
-            exit(1);
-        }
-    }
-    if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        train_print_usage(argc, argv, &default_params);
-        exit(1);
-    }
-    finish_processing_train_args(&params->common);
-    return true;
-}
-
-struct save_train_files_data {
-    const char            * fn_checkpoint_out;
-    const char            * fn_lora_out;
-    const char            * pattern_fn_it;
-    const char            * fn_latest;
-    struct my_llama_model * model;
-    struct my_llama_lora  * lora;
-};
-
-static void save_train_files(void * vdata, struct train_state * train) {
-    struct save_train_files_data * data   = (struct save_train_files_data *) vdata;
-
-    int64_t iter = train->opt->iter;
-
-    if (strlen(data->fn_checkpoint_out) > 0) {
-        save_checkpoint_lora_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->model, data->lora, train);
-        save_checkpoint_lora_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, -1  ).c_str(), data->model, data->lora, train);
-    }
-    if (strlen(data->fn_lora_out) > 0) {
-        save_as_llama_lora(get_train_filename(data->fn_lora_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->lora);
-        save_as_llama_lora(get_train_filename(data->fn_lora_out, data->pattern_fn_it, data->fn_latest, -1  ).c_str(), data->lora);
-    }
-}
-
-static int64_t get_parameter_count(struct my_llama_lora* lora) {
-    int64_t nx = 0;
-    nx += ggml_nelements(lora->tok_embeddings_a);
-    nx += ggml_nelements(lora->tok_embeddings_b);
-    nx += ggml_nelements(lora->norm_a);
-    nx += ggml_nelements(lora->norm_b);
-    nx += ggml_nelements(lora->output_a);
-    nx += ggml_nelements(lora->output_b);
-
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
-        nx += ggml_nelements(layer.attention_norm_a);
-        nx += ggml_nelements(layer.attention_norm_b);
-        nx += ggml_nelements(layer.wq_a);
-        nx += ggml_nelements(layer.wq_b);
-        nx += ggml_nelements(layer.wk_a);
-        nx += ggml_nelements(layer.wk_b);
-        nx += ggml_nelements(layer.wv_a);
-        nx += ggml_nelements(layer.wv_b);
-        nx += ggml_nelements(layer.wo_a);
-        nx += ggml_nelements(layer.wo_b);
-        nx += ggml_nelements(layer.ffn_norm_a);
-        nx += ggml_nelements(layer.ffn_norm_b);
-        nx += ggml_nelements(layer.ffn_gate_a);
-        nx += ggml_nelements(layer.ffn_gate_b);
-        nx += ggml_nelements(layer.ffn_down_a);
-        nx += ggml_nelements(layer.ffn_down_b);
-        nx += ggml_nelements(layer.ffn_up_a);
-        nx += ggml_nelements(layer.ffn_up_b);
-    }
-    return nx;
-}
-
-int main(int argc, char ** argv) {
-    struct train_params params = get_default_train_params();
-
-    if (!train_params_parse(argc, argv, &params)) {
-        return 1;
-    }
-
-    if (params.common.seed == LLAMA_DEFAULT_SEED) {
-        params.common.seed = time(NULL);
-    }
-    printf("%s: seed: %u\n", __func__, params.common.seed);
-    srand(params.common.seed);
-
-    struct llama_model_params llama_mparams = llama_model_default_params();
-    llama_mparams.n_gpu_layers = params.common.n_gpu_layers;
-    llama_mparams.vocab_only = false;
-
-    printf("%s: model base = '%s'\n", __func__, params.fn_model_base);
-    struct llama_model * lmodel = llama_load_model_from_file(params.fn_model_base, llama_mparams);
-
-    struct llama_context_params llama_cparams = llama_context_default_params();
-    struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_cparams);
-
-    struct my_llama_model model;
-    init_model(lmodel, &model, params.fn_model_base, params.common.n_ctx);
-
-    struct my_llama_lora lora;
-
-    struct train_state      * train = init_train_state();
-    struct ggml_opt_context * opt   = train->opt;
-
-    // set params from command line
-    if (params.custom_f_norm_rms_eps) {
-        model.hparams.f_norm_rms_eps  = params.f_norm_rms_eps;
-    }
-    if (params.custom_rope_freq_base) {
-        model.hparams.rope_freq_base  = params.rope_freq_base;
-    }
-    if (params.custom_rope_freq_scale) {
-        model.hparams.rope_freq_scale = params.rope_freq_scale;
-    }
-    lora.hparams.lora_r                = params.lora_r;
-    lora.hparams.lora_alpha            = params.custom_lora_alpha            ? params.lora_alpha            : params.lora_r;
-    uint32_t n_rank_attention_norm     = params.custom_n_rank_attention_norm ? params.n_rank_attention_norm : 1;
-    uint32_t n_rank_wq                 = params.custom_n_rank_wq             ? params.n_rank_wq             : params.lora_r;
-    uint32_t n_rank_wk                 = params.custom_n_rank_wk             ? params.n_rank_wk             : params.lora_r;
-    uint32_t n_rank_wv                 = params.custom_n_rank_wv             ? params.n_rank_wv             : params.lora_r;
-    uint32_t n_rank_wo                 = params.custom_n_rank_wo             ? params.n_rank_wo             : params.lora_r;
-    uint32_t n_rank_ffn_norm           = params.custom_n_rank_ffn_norm       ? params.n_rank_ffn_norm       : 1;
-    uint32_t n_rank_ffn_gate           = params.custom_n_rank_ffn_gate       ? params.n_rank_ffn_gate       : params.lora_r;
-    uint32_t n_rank_ffn_down           = params.custom_n_rank_ffn_down       ? params.n_rank_ffn_down       : params.lora_r;
-    uint32_t n_rank_ffn_up             = params.custom_n_rank_ffn_up         ? params.n_rank_ffn_up         : params.lora_r;
-    uint32_t n_rank_tok_embeddings     = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r;
-    uint32_t n_rank_norm               = params.custom_n_rank_norm           ? params.n_rank_norm           : 1;
-    uint32_t n_rank_output             = params.custom_n_rank_output         ? params.n_rank_output         : params.lora_r;
-    lora.hparams.n_rank_attention_norm = n_rank_attention_norm;
-    lora.hparams.n_rank_wq             = n_rank_wq;
-    lora.hparams.n_rank_wk             = n_rank_wk;
-    lora.hparams.n_rank_wv             = n_rank_wv;
-    lora.hparams.n_rank_wo             = n_rank_wo;
-    lora.hparams.n_rank_ffn_norm       = n_rank_ffn_norm;
-    lora.hparams.n_rank_ffn_gate       = n_rank_ffn_gate;
-    lora.hparams.n_rank_ffn_down       = n_rank_ffn_down;
-    lora.hparams.n_rank_ffn_up         = n_rank_ffn_up;
-    lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings;
-    lora.hparams.n_rank_norm           = n_rank_norm;
-    lora.hparams.n_rank_output         = n_rank_output;
-
-    // set opt params from command line
-    opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
-    opt->params.print_forward_graph     = false;
-    opt->params.print_backward_graph    = false;
-    opt->params.graph_size              = LLAMA_TRAIN_MAX_NODES;
-    opt->params.n_threads               = params.common.n_threads;
-    opt->params.past                    = params.common.opt_past;
-    opt->params.delta                   = params.common.opt_delta;
-    opt->params.max_no_improvement      = params.common.opt_max_no_improvement;
-    opt->params.n_gradient_accumulation = params.common.n_gradient_accumulation;
-    opt->params.adam.n_iter             = params.common.adam_n_iter;
-    opt->params.adam.sched              = 1.0f;
-    opt->params.adam.alpha              = params.common.adam_alpha;
-    opt->params.adam.decay              = params.common.adam_decay;
-    opt->params.adam.decay_min_ndim     = params.common.adam_decay_min_ndim;
-    opt->params.adam.beta1              = params.common.adam_beta1;
-    opt->params.adam.beta2              = params.common.adam_beta2;
-    opt->params.adam.gclip              = params.common.adam_gclip;
-    opt->params.adam.eps_f              = params.common.adam_eps_f;
-
-    printf("%s: init model\n", __func__);
-    bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train);
-
-    if (existed) {
-        // overwrite last n_ctx with user provided n_ctx
-        if (params.common.custom_n_ctx) {
-            model.hparams.n_ctx = params.common.n_ctx;
-        }
-
-        const bool opt_param_count_changed = (
-           (lora.hparams.n_rank_attention_norm != n_rank_attention_norm)
-        || (lora.hparams.n_rank_wq             != n_rank_wq)
-        || (lora.hparams.n_rank_wk             != n_rank_wk)
-        || (lora.hparams.n_rank_wv             != n_rank_wv)
-        || (lora.hparams.n_rank_wo             != n_rank_wo)
-        || (lora.hparams.n_rank_ffn_norm       != n_rank_ffn_norm)
-        || (lora.hparams.n_rank_ffn_gate       != n_rank_ffn_gate)
-        || (lora.hparams.n_rank_ffn_down       != n_rank_ffn_down)
-        || (lora.hparams.n_rank_ffn_up         != n_rank_ffn_up)
-        || (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings)
-        || (lora.hparams.n_rank_norm           != n_rank_norm)
-        || (lora.hparams.n_rank_output         != n_rank_output)
-        );
-
-        const bool opt_past_changed = opt->params.past != params.common.opt_past;
-
-        if (opt_param_count_changed) {
-            print_lora_params(&lora.hparams);
-            die("Provided rank differs from checkpoint file. To use different rank start finetune from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting.");
-            // need to discard previous optimizer gradient statistics and opt_init with new shapes
-            // TODO
-        }
-        if (opt_past_changed) {
-            die("Optimizer parameter '--opt-past N' differs from checkpoint file. To use different value finetune from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting");
-            // need to discard previous optimizer past function value statistics and opt_init with new shapes
-            // TODO
-        }
-    } else { // existed == false
-        init_lora(&model, &lora);
-        randomize_lora(&lora, params.common.seed, 0.0f, 1.0f, -1.0f, +1.0f);
-        if (!params.only_write_lora) {
-            ggml_opt_init(opt->ctx, opt, opt->params, get_parameter_count(&lora));
-        }
-    }
-    opt->iter = train->train_its;
-
-    print_params(&model.hparams);
-    print_lora_params(&lora.hparams);
-    printf("%s: total train_iterations %llu\n", __func__, (long long unsigned) train->train_its);
-    printf("%s: seen train_samples     %llu\n", __func__, (long long unsigned) train->train_samples);
-    printf("%s: seen train_tokens      %llu\n", __func__, (long long unsigned) train->train_tokens);
-    printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
-    printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)), (float) (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)) / (1024.0f*1024.0f));
-
-    if (params.only_write_lora) {
-        save_train_files_data save_data;
-        save_data.fn_checkpoint_out = "";
-        save_data.fn_lora_out       = params.fn_lora_out;
-        save_data.pattern_fn_it     = params.common.pattern_fn_it;
-        save_data.fn_latest         = params.common.fn_latest;
-        save_data.model             = &model;
-        save_data.lora              = &lora;
-
-        save_train_files(&save_data, train);
-
-        free_train_state(train);
-        ggml_free(lora.ctx);
-        llama_free(lctx);
-        llama_free_model(lmodel);
-        return 0;
-    }
-
-    printf("%s: opt_size  = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f));
-    printf("%s: opt iter %d\n", __func__, opt->iter);
-
-    int n_tokens = model.hparams.n_ctx;
-    int n_vocab  = model.hparams.n_vocab;
-    int n_batch  = params.common.n_batch;
-
-    // context for input tensors without their data
-    struct ggml_init_params ctx_input_params = {
-        ggml_tensor_overhead() * 2, // mem_size
-        NULL,                       // mem_buffer
-        true,                       // no_alloc
-    };
-    struct ggml_context * ctx_input = ggml_init(ctx_input_params);
-
-    // the input tensors
-    struct ggml_tensor * tokens_input  = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch);
-    struct ggml_tensor * target_probs  = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab,  n_tokens, n_batch);
-
-    // allocate input tensors
-    // measure required memory for input tensors
-    ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type());
-    size_t max_input_size = ggml_backend_buffer_get_size(input_data);
-    printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
-
-    // context for compute tensors without their data
-    const size_t estimated_compute_size_wo_data = (
-            2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() +
-            (params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true))
-    );
-    struct ggml_init_params ctx_compute_params = {
-        estimated_compute_size_wo_data, // mem_size
-        NULL,                           // mem_buffer
-        true,                           // no_alloc
-    };
-    struct ggml_context * ctx_compute = NULL;
-
-    struct ggml_tensor * loss   = NULL;
-    struct ggml_tensor * logits = NULL;
-
-    struct ggml_cgraph * gf     = NULL;
-    struct ggml_cgraph * gb     = NULL;
-    struct ggml_cgraph * gb_tmp = NULL;
-
-    // measure required memory for compute tensors
-    size_t best_compute_size = SIZE_MAX;
-    enum ggml_cgraph_eval_order best_order = GGML_CGRAPH_EVAL_ORDER_COUNT;
-    // find best evaluation order
-    for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
-        ctx_compute = ggml_init(ctx_compute_params);
-        ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
-        gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-        gf->order = (enum ggml_cgraph_eval_order) order;
-        gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-        gb_tmp = params.common.use_checkpointing
-            ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
-            : NULL;
-        loss = llama_build_lora_finetune_graphs(
-            &model, &lora, alloc, ctx_compute,
-            gf, gb, gb_tmp,
-            &logits, tokens_input, target_probs,
-            n_tokens, n_batch,
-            params.common.use_flash,
-            params.common.use_checkpointing,
-            true
-        );
-        size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer
-        if (max_compute_size < best_compute_size) {
-            best_compute_size = max_compute_size;
-            best_order = gf->order;
-        }
-        ggml_gallocr_free(alloc);
-        ggml_free(ctx_compute);
-    }
-    size_t max_compute_size = best_compute_size;
-    printf("%s: compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f));
-    printf("%s: evaluation order = %s\n", __func__,
-        (best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" :
-        (best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" :
-        "invalid");
-
-    // allocate compute tensors
-    ctx_compute = ggml_init(ctx_compute_params);
-    ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
-    gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-    gf->order = best_order;
-    gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-    gb_tmp = params.common.use_checkpointing
-        ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
-        : NULL;
-    loss = llama_build_lora_finetune_graphs(
-        &model, &lora, alloc, ctx_compute,
-        gf, gb, gb_tmp,
-        &logits, tokens_input, target_probs,
-        n_tokens, n_batch,
-        params.common.use_flash,
-        params.common.use_checkpointing,
-        false
-    );
-
-    // tokenize data
-    std::vector<llama_token> train_tokens;
-    std::vector<size_t> train_samples_begin;
-    std::vector<size_t> train_samples_size;
-    printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data);
-    printf("%s: sample-start: %s\n", __func__, params.common.sample_start.c_str());
-    printf("%s: include-sample-start: %s\n", __func__, params.common.include_sample_start ? "true" : "false");
-    tokenize_file(lctx,
-            params.common.fn_train_data,
-            params.common.sample_start,
-            params.common.include_sample_start,
-            params.common.overlapping_samples,
-            n_tokens,
-            train_tokens,
-            train_samples_begin,
-            train_samples_size);
-    GGML_ASSERT(train_samples_begin.size() == train_samples_size.size());
-
-    printf("%s: number of training tokens: %zu\n", __func__, train_tokens.size());
-
-    std::vector<size_t> token_noccurs;
-    token_noccurs.resize(model.hparams.n_vocab, 0);
-    for (unsigned int i = 0; i < train_tokens.size(); ++i) {
-        ++token_noccurs[train_tokens[i]];
-    }
-    int n_unique_tokens = 0;
-    for (unsigned int i = 0; i < token_noccurs.size(); ++i) {
-        if (token_noccurs[i] == 0) continue;
-        ++n_unique_tokens;
-    }
-    printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens);
-
-    size_t shuffle_samples_hash = compute_samples_hash(params.common.fn_train_data, train_samples_begin.data(), train_samples_size.data(), train_samples_size.size());
-    const bool changed_train_data = (shuffle_samples_hash != train->shuffle_samples_hash) || (train->shuffle_sample_count != train_samples_size.size());
-    if (changed_train_data) {
-        printf("%s: train data seems to have changed. restarting shuffled epoch.\n", __func__);
-    }
-    if (params.common.force_reshuffle) {
-        printf("%s: forced reshuffling of data. restarting with newly shuffled epoch.\n", __func__);
-    }
-    if ((train->shuffle_rng_state_current == "") || changed_train_data || params.common.force_reshuffle) {
-        train->shuffle_rng_state_current = mt19937_seed_to_state(params.common.seed);
-        train->shuffle_sample_count = train_samples_size.size();
-        train->shuffle_next_sample = 0;
-        train->shuffle_samples_hash = shuffle_samples_hash;
-    }
-    std::vector<size_t> train_shuffled_samples_offs;
-    std::vector<size_t> train_shuffled_samples_begin;
-    std::vector<size_t> train_shuffled_samples_size;
-    train_shuffled_samples_offs.resize(train_samples_begin.size());
-    train_shuffled_samples_begin.resize(train_samples_begin.size());
-    train_shuffled_samples_size.resize(train_samples_size.size());
-    train->shuffle_rng_state_next = shuffle_samples(
-        train->shuffle_rng_state_current,
-        train_shuffled_samples_offs.data(),
-        train_shuffled_samples_begin.data(),
-        train_shuffled_samples_size.data(),
-        train_samples_begin.data(),
-        train_samples_size.data(),
-        train_samples_size.size());
-
-    printf("%s: begin training\n", __func__);
-
-    save_train_files_data save_data;
-    save_data.fn_checkpoint_out = params.common.fn_checkpoint_out;
-    save_data.fn_lora_out       = params.fn_lora_out;
-    save_data.pattern_fn_it     = params.common.pattern_fn_it;
-    save_data.fn_latest         = params.common.fn_latest;
-    save_data.model             = &model;
-    save_data.lora              = &lora;
-
-    struct train_opt_callback_data opt_cb_data;
-    opt_cb_data.params                 = &params.common;
-    opt_cb_data.train                  = train;
-    opt_cb_data.save_cb                = &save_train_files;
-    opt_cb_data.save_data              = &save_data;
-    opt_cb_data.lctx                   = lctx;
-    opt_cb_data.last_save_iter         = opt->iter;
-    opt_cb_data.tokens_data            = train_tokens.data();
-    opt_cb_data.tokens_size            = train_tokens.size();
-    opt_cb_data.samples_begin          = train_samples_begin.data();
-    opt_cb_data.samples_size           = train_samples_size.data();
-    opt_cb_data.shuffled_samples_offs  = train_shuffled_samples_offs.data();
-    opt_cb_data.shuffled_samples_begin = train_shuffled_samples_begin.data();
-    opt_cb_data.shuffled_samples_size  = train_shuffled_samples_size.data();
-    opt_cb_data.samples_count          = train_samples_size.size();
-    opt_cb_data.tokens_input           = tokens_input;
-    opt_cb_data.target_probs           = target_probs;
-    opt_cb_data.first_iter             = opt->iter;
-    opt_cb_data.first_epoch            = train->train_epochs;
-    opt_cb_data.iter_at_last_epoch     = -1;
-    opt_cb_data.last_time              = ggml_time_ms();
-    opt_cb_data.millis_per_iter        = 0.0;
-
-    // measure required memory for work buffer
-    size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE;
-    printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f));
-
-    // context for work buffer
-    struct ggml_init_params ctx_work_params = {
-        max_work_size, // mem_size
-        NULL,          // mem_buffer
-        false,         // no_alloc
-    };
-    struct ggml_context * ctx_work = ggml_init(ctx_work_params);
-
-    int64_t t0 = ggml_time_ms();
-
-    ggml_opt_resume_g(ctx_work, opt, loss, gf, gb, &train_opt_callback, (void *) &opt_cb_data);
-
-    ggml_free(ctx_work);
-    ggml_free(ctx_compute);
-    ggml_free(ctx_input);
-    ggml_gallocr_free(alloc);
-
-
-    int64_t t1 = ggml_time_ms();
-    printf("%s: total training time: ", __func__);
-    print_duration((double) (t1 - t0));
-    printf("\n");
-
-    int new_iters = opt->iter - opt_cb_data.last_save_iter;
-    if (new_iters > 0) {
-        train->train_its     += new_iters;
-        train->train_tokens  += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens;
-
-        save_train_files(&save_data, train);
-        opt_cb_data.last_save_iter = opt->iter;
-    }
-
-    ggml_free(opt->ctx);
-    free_train_state(train);
-    ggml_free(lora.ctx);
-    llama_free(lctx);
-    llama_free_model(lmodel);
-    return 0;
-}
diff --git a/examples/finetune/finetune.sh b/examples/finetune/finetune.sh
deleted file mode 100644
index e3cc7f271..000000000
--- a/examples/finetune/finetune.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-cd `dirname $0`
-cd ../..
-
-EXE="./llama-finetune"
-
-if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
-if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
-
-# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
-MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "llama-cli --lora" with GPU inferencing.
-
-while getopts "dg" opt; do
-  case $opt in
-    d)
-      DEBUGGER="gdb --args"
-      ;;
-    g)
-      EXE="./build/bin/Release/finetune"
-      GPUARG="--gpu-layers 25"
-      ;;
-  esac
-done
-
-$DEBUGGER $EXE \
-        --model-base $MODEL \
-        $GPUARG \
-        --checkpoint-in  chk-ol3b-shakespeare-LATEST.gguf \
-        --checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \
-        --lora-out lora-ol3b-shakespeare-ITERATION.bin \
-        --train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \
-        --save-every 10 \
-        --threads 10 --adam-iter 30 --batch 4 --ctx 64 \
-        --use-checkpointing
diff --git a/examples/train-text-from-scratch/CMakeLists.txt b/examples/train-text-from-scratch/CMakeLists.txt
deleted file mode 100644
index 9a1d2a35e..000000000
--- a/examples/train-text-from-scratch/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-set(TARGET llama-train-text-from-scratch)
-add_executable(${TARGET} train-text-from-scratch.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/train-text-from-scratch/README.md b/examples/train-text-from-scratch/README.md
deleted file mode 100644
index 3abae2380..000000000
--- a/examples/train-text-from-scratch/README.md
+++ /dev/null
@@ -1,27 +0,0 @@
-# train-text-from-scratch
-
-Basic usage instructions:
-
-```bash
-# get training data
-wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
-
-# train
-./bin/llama-train-text-from-scratch \
-        --vocab-model ../models/ggml-vocab-llama.gguf \
-        --ctx 64 --embd 256 --head 8 --layer 16 \
-        --checkpoint-in  chk-shakespeare-256x16-LATEST.gguf \
-        --checkpoint-out chk-shakespeare-256x16-ITERATION.gguf \
-        --model-out ggml-shakespeare-256x16-f32-ITERATION.gguf \
-        --train-data "shakespeare.txt" \
-        -t 6 -b 16 --seed 1 --adam-iter 256 \
-        --no-checkpointing
-
-# predict
-./bin/llama-cli -m ggml-shakespeare-256x16-f32.gguf
-```
-
-Output files will be saved every N iterations (config with `--save-every N`).
-The pattern "ITERATION" in the output filenames will be replaced with the iteration number and "LATEST" for the latest output.
-
-To train GGUF models just pass them to `--checkpoint-in FN`.
diff --git a/examples/train-text-from-scratch/convert_train_checkpoint_to_gguf.py b/examples/train-text-from-scratch/convert_train_checkpoint_to_gguf.py
deleted file mode 100644
index e045beb72..000000000
--- a/examples/train-text-from-scratch/convert_train_checkpoint_to_gguf.py
+++ /dev/null
@@ -1,499 +0,0 @@
-#!/usr/bin/env python3
-# train-text-from-scratch checkpoint --> gguf conversion
-
-import argparse
-import os
-import struct
-import sys
-import numpy as np
-from pathlib import Path
-
-if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py'))
-import gguf
-
-# gguf constants
-LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
-LLM_KV_OPTIMIZER_TYPE_ADAM  = "adam"
-LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
-LLM_KV_OPTIMIZER_FILE_VERSION               = "optimizer.file_version"
-LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT     = "optimizer.convergence_past_count"
-LLM_KV_OPTIMIZER_PARAMETER_COUNT            = "optimizer.parameter_count"
-LLM_KV_OPTIMIZER_ITERATION_COUNT            = "optimizer.iteration_count"
-LLM_KV_OPTIMIZER_JUST_INITIALIZED           = "optimizer.just_initialized"
-LLM_KV_OPTIMIZER_ADAM_BEST_LOSS             = "optimizer.adam.best_loss"
-LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS         = "optimizer.adam.previous_loss"
-LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT  = "optimizer.adam.no_improvement_count"
-LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
-LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS            = "optimizer.lbfgs.best_loss"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP     = "optimizer.lbfgs.line_search_step"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J        = "optimizer.lbfgs.line_search_j"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K        = "optimizer.lbfgs.line_search_k"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END      = "optimizer.lbfgs.line_search_end"
-LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
-
-LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS    = "optimizer.adam.first_moments"
-LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS   = "optimizer.adam.second_moments"
-LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
-
-LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS  = "optimizer.lbfgs.current_parameters"
-LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
-LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS   = "optimizer.lbfgs.current_gradients"
-LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS  = "optimizer.lbfgs.previous_gradients"
-LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION    = "optimizer.lbfgs.search_direction"
-LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES    = "optimizer.lbfgs.past_loss_values"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA        = "optimizer.lbfgs.memory_alpha"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS           = "optimizer.lbfgs.memory_ys"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S            = "optimizer.lbfgs.memory_s"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y            = "optimizer.lbfgs.memory_y"
-
-LLM_KV_TRAINING_TYPE_TRAIN_MODEL   = "train_model"
-LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
-LLM_KV_TRAINING_TYPE               = "training.type"
-LLM_KV_TRAINING_FILE_VERSION       = "training.file_version"
-LLM_KV_TRAINING_ITERATION_COUNT    = "training.iteration_count"
-LLM_KV_TRAINING_SAMPLE_COUNT       = "training.sample_count"
-LLM_KV_TRAINING_TOKEN_COUNT        = "training.token_count"
-
-class Tensor:
-    def __init__(self, dtype='f', ne=None):
-        if ne is None:
-            ne = []
-        self.dtype = dtype
-        self.ne = ne
-        self.nbytes = 0
-        if self.dtype == 'f':
-            if len(self.ne) == 0:
-                self.nbytes = 0
-            else:
-                self.nbytes = int(np.prod(self.ne)) * 4
-        else:
-            raise ValueError(f"Unhandled data type '{self.dtype}'")
-
-    def load(self, data, offset):
-        nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-
-        assert(nd == len(self.ne))
-        ne = []
-        for d in range(nd):
-            n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-            ne.append(n)
-
-        assert(tuple(ne) == tuple(self.ne))
-
-        if self.dtype == 'f':
-            assert(dtype == 0)
-        else:
-            raise ValueError(f"Unhandled data type '{self.dtype}'")
-
-        self.name = bytes(data[offset:offset+namelen]); offset += namelen
-        # 32-byte alignment
-        offset += (0 - offset) & 31
-        self.data = data[offset:offset+self.nbytes]
-        offset += self.nbytes
-        return offset
-
-    def max_storage_size(self):
-        result = 0
-        result += 4 # nd
-        result += 4 # namelen
-        result += 4 # dtype
-        result += len(self.ne)*8 # ne
-        result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
-        result += 31 # 32-byte alignment
-        result += self.nbytes
-        return result
-
-    def save_gguf(self, gguf_writer, name):
-        gguf_writer.add_tensor(
-            name=name,
-            tensor=self.data,
-            raw_shape=np.array(list(reversed(self.ne))),
-            raw_dtype=gguf.GGMLQuantizationType.F32)
-
-class OptimizationParamsV0:
-    def __init__(self):
-        pass
-
-    def load(self, data, offset):
-        self.type                 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_threads            = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.past                 = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.delta                = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.print_forward_graph  = struct.unpack('<?', bytes(data[offset:offset + 1]))[0];  offset += 4 # 32bit-aligned
-        self.print_backward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0];  offset += 4 # 32bit-aligned
-        self.adam_n_iter          = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.adam_sched           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.adam_decay           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.adam_alpha           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.adam_beta1           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.adam_beta2           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.adam_eps             = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.adam_eps_f           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.adam_eps_g           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_m              = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_n_iter         = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_max_linesearch = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_eps            = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_ftol           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_wolfe          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_min_step       = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_max_step       = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_linesearch     = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        return offset
-
-class OptimizationContext:
-    def __init__(self):
-        pass
-
-    def load(self, data, offset):
-        self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
-        offset += 4
-
-        if self.version == 0:
-            params = OptimizationParamsV0()
-            offset = params.load(data, offset)
-            self.past = params.past
-            self.lbfgs_m = params.lbfgs_m
-            self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0];  offset += 8
-            self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]);  offset += 4
-            self.type = params.type
-
-            self.adam_m  = Tensor('f', [self.nx])
-            self.adam_v  = Tensor('f', [self.nx])
-            self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
-
-            self.lbfgs_x    = Tensor('f', [self.nx])
-            self.lbfgs_xp   = Tensor('f', [self.nx])
-            self.lbfgs_g    = Tensor('f', [self.nx])
-            self.lbfgs_gp   = Tensor('f', [self.nx])
-            self.lbfgs_d    = Tensor('f', [self.nx])
-            self.lbfgs_pf   = Tensor('f', [self.past] if self.past > 0 else [])
-            self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
-            self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
-            self.lbfgs_lms  = Tensor('f', [self.nx, self.lbfgs_m])
-            self.lbfgs_lmy  = Tensor('f', [self.nx, self.lbfgs_m])
-
-            if self.type == 0:
-                # these tensors are stored, but we don't need their data
-                x  = Tensor('f', [self.nx])
-                g  = Tensor('f', [self.nx])
-                g2 = Tensor('f', [self.nx])
-                mh = Tensor('f', [self.nx])
-                vh = Tensor('f', [self.nx])
-
-                offset = x.load(data, offset)
-                offset = g.load(data, offset)
-                offset = g2.load(data, offset)
-                offset = self.adam_m.load(data, offset)
-                offset = self.adam_v.load(data, offset)
-                offset = mh.load(data, offset)
-                offset = vh.load(data, offset)
-                offset = self.adam_pf.load(data, offset)
-
-                self.adam_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.adam_fx_prev          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-
-            elif self.type == 1:
-                offset = self.lbfgs_x.load(data, offset)
-                offset = self.lbfgs_xp.load(data, offset)
-                offset = self.lbfgs_g.load(data, offset)
-                offset = self.lbfgs_gp.load(data, offset)
-                offset = self.lbfgs_d.load(data, offset)
-                offset = self.lbfgs_pf.load(data, offset)
-                offset = self.lbfgs_lmal.load(data, offset)
-                offset = self.lbfgs_lmys.load(data, offset)
-                offset = self.lbfgs_lms.load(data, offset)
-                offset = self.lbfgs_lmy.load(data, offset)
-
-                self.lbfgs_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_step             = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_j                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_k                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_end              = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-
-            else:
-                raise ValueError('Unknown optimizer type')
-
-
-        elif self.version == 1:
-            self.past    = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.nx      = struct.unpack('N',  bytes(data[offset:offset + 8]))[0];  offset += 8
-            self.iter    = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]);  offset += 4
-
-            self.adam_m  = Tensor('f', [self.nx])
-            self.adam_v  = Tensor('f', [self.nx])
-            self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
-
-            self.lbfgs_x    = Tensor('f', [self.nx])
-            self.lbfgs_xp   = Tensor('f', [self.nx])
-            self.lbfgs_g    = Tensor('f', [self.nx])
-            self.lbfgs_gp   = Tensor('f', [self.nx])
-            self.lbfgs_d    = Tensor('f', [self.nx])
-            self.lbfgs_pf   = Tensor('f', [self.past] if self.past > 0 else [])
-            self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
-            self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
-            self.lbfgs_lms  = Tensor('f', [self.nx, self.lbfgs_m])
-            self.lbfgs_lmy  = Tensor('f', [self.nx, self.lbfgs_m])
-
-            # forgot to save type in version 1:
-            # guess self.type from number of remaining bytes
-            size_type_0 = 12 + sum([t.max_storage_size() for t in
-                                    [self.adam_m, self.adam_v]
-                                    +([self.adam_pf] if (self.past > 0) else [])])
-            size_type_1 = 24 + sum([t.max_storage_size() for t in
-                                    [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
-                                     self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
-                                     self.lbfgs_lmal, self.lbfgs_lmys,
-                                     self.lbfgs_lms, self.lbfgs_lmy]
-                                     +([self.lbfgs_pf] if (self.past > 0) else [])])
-            # due to alignment padding the size might not by exact
-            # but the difference in size for both types is significant,
-            # so we can just use whichever is closest
-            remaining = len(data) - offset
-            if abs(remaining - size_type_0) < abs(remaining - size_type_1):
-                self.type = 0
-            else:
-                self.type = 1
-
-            if self.type == 0:
-                offset = self.adam_m.load(data, offset)
-                offset = self.adam_v.load(data, offset)
-                offset = self.adam_pf.load(data,offset)
-
-                self.adam_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.adam_fx_prev          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-
-            elif self.type == 1:
-                offset = self.lbfgs_x.load(data, offset)
-                offset = self.lbfgs_xp.load(data, offset)
-                offset = self.lbfgs_g.load(data, offset)
-                offset = self.lbfgs_gp.load(data, offset)
-                offset = self.lbfgs_d.load(data, offset)
-                offset = self.lbfgs_pf.load(data, offset)
-                offset = self.lbfgs_lmal.load(data, offset)
-                offset = self.lbfgs_lmys.load(data, offset)
-                offset = self.lbfgs_lms.load(data, offset)
-                offset = self.lbfgs_lmy.load(data, offset)
-
-                self.lbfgs_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_step             = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_j                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_k                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_end              = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-
-        else:
-            raise ValueError('Invalid version of checkpoint file')
-
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
-        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
-        gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
-        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
-        gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
-
-        if self.type == 0:
-            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
-            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
-
-            self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
-            self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
-            if self.past > 0:
-                self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
-
-        elif self.type == 1:
-            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
-            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
-            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
-            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
-            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
-            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
-
-            self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
-            self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
-            self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
-            self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
-            self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
-            if self.past > 0:
-                self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
-            self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
-            self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
-            self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
-            self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
-        else:
-            raise ValueError('Unknown optimizer type')
-
-class ModelParams:
-    def __init__(self):
-        pass
-
-    def load(self, data, offset):
-        self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_embd  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_mult  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_head  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rot   = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        return offset
-
-    def get_n_ff(self):
-        # struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
-        return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
-
-    def save_gguf(self, gguf_writer):
-        # self.n_vocab not saved
-        gguf_writer.add_embedding_length(self.n_embd)
-        gguf_writer.add_head_count(self.n_head)
-        gguf_writer.add_block_count(self.n_layer)
-        gguf_writer.add_rope_dimension_count(self.n_rot)
-        gguf_writer.add_feed_forward_length(self.get_n_ff())
-
-def tensor_name(key, bid=None):
-    return gguf.TENSOR_NAMES[key].format(bid=bid) + ".weight"
-
-class Layer:
-    def __init__(self, params, bid):
-        self.bid = bid
-        self.att_norm = Tensor('f', [params.n_embd])
-        self.wq       = Tensor('f', [params.n_embd, params.n_embd])
-        self.wk       = Tensor('f', [params.n_embd, params.n_embd])
-        self.wv       = Tensor('f', [params.n_embd, params.n_embd])
-        self.wo       = Tensor('f', [params.n_embd, params.n_embd])
-        self.ffn_norm = Tensor('f', [params.n_embd])
-        self.w1       = Tensor('f', [params.n_embd, params.get_n_ff()])
-        self.w2       = Tensor('f', [params.get_n_ff(), params.n_embd])
-        self.w3       = Tensor('f', [params.n_embd, params.get_n_ff()])
-
-    def load(self, data, offset):
-        offset = self.att_norm.load(data, offset)
-        offset = self.wq.load(data, offset)
-        offset = self.wk.load(data, offset)
-        offset = self.wv.load(data, offset)
-        offset = self.wo.load(data, offset)
-        offset = self.ffn_norm.load(data, offset)
-        offset = self.w1.load(data, offset)
-        offset = self.w2.load(data, offset)
-        offset = self.w3.load(data, offset)
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        self.att_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid))
-        self.wq.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q,    self.bid))
-        self.wk.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K,    self.bid))
-        self.wv.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V,    self.bid))
-        self.wo.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT,  self.bid))
-        self.ffn_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM,  self.bid))
-        self.w1.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE,  self.bid))
-        self.w2.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN,  self.bid))
-        self.w3.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP,    self.bid))
-
-class Model:
-    def __init__(self):
-        self.params = ModelParams()
-        self.layers = []
-
-    def load(self, data, offset):
-        offset = self.params.load(data, offset)
-
-        self.tok_embd = Tensor('f', [self.params.n_embd, self.params.n_vocab])
-        self.norm     = Tensor('f', [self.params.n_embd])
-        self.output   = Tensor('f', [self.params.n_embd, self.params.n_vocab])
-
-        offset = self.tok_embd.load(data, offset)
-        offset = self.norm.load(data, offset)
-        offset = self.output.load(data, offset)
-
-        self.layers.clear()
-        for bid in range(self.params.n_layer):
-            layer = Layer(self.params, bid)
-            offset = layer.load(data, offset)
-            self.layers.append(layer)
-
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        self.params.save_gguf(gguf_writer)
-
-        self.tok_embd.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD))
-        self.norm.save_gguf    (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM))
-        self.output.save_gguf  (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT))
-
-        for layer in self.layers:
-            layer.save_gguf(gguf_writer)
-
-class Checkpoint:
-    def __init__(self):
-        self.model = Model()
-        self.opt_ctx = OptimizationContext()
-
-    def load(self, data, offset):
-        magic   = bytes(reversed(data[offset:offset + 4])); offset += 4
-        if magic != b'ggcp':
-            raise ValueError(f"File header magic indicates, that this is no checkpoint file. Expected 'ggcp', Got '{str(magic)}'")
-
-        self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        if self.version != 0:
-            raise ValueError('Invalid version of checkpoint file')
-
-        self.train_its     = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        self.train_tokens  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-
-        offset = self.model.load(data, offset)
-        offset = self.opt_ctx.load(data, offset)
-
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
-        gguf_writer.add_layer_norm_rms_eps(1e-5)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION,    0)
-        gguf_writer.add_string(LLM_KV_TRAINING_TYPE,            LLM_KV_TRAINING_TYPE_TRAIN_MODEL)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT,    self.train_samples)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT,     self.train_tokens)
-        self.model.save_gguf(gguf_writer)
-        self.opt_ctx.save_gguf(gguf_writer)
-
-def handle_args():
-    parser = argparse.ArgumentParser(description = 'Convert train-text-from-scratch checkpoints to GGUF')
-    parser.add_argument('--input',  '-i', type = Path, help = 'Input train checkpoint filename', required=True)
-    parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename', required=True)
-    return parser.parse_args()
-
-def main():
-    cfg = handle_args()
-    data = np.memmap(cfg.input, mode = 'r')
-    chk = Checkpoint()
-    offset = 0
-    offset = chk.load(data, offset)
-    # we should have read all available data
-    assert(offset == len(data))
-
-    gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
-    chk.save_gguf(gguf_writer)
-    print("    gguf: write header")
-    gguf_writer.write_header_to_file()
-    print("    gguf: write metadata")
-    gguf_writer.write_kv_data_to_file()
-    print("    gguf: write tensors")
-    gguf_writer.write_tensors_to_file()
-    gguf_writer.close()
-
-if __name__ == '__main__':
-    main()
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
deleted file mode 100644
index b779f6bd4..000000000
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ /dev/null
@@ -1,1253 +0,0 @@
-#include "ggml.h"
-#include "ggml-alloc.h"
-#include "ggml-backend.h"
-#include "common.h"
-#include "train.h"
-#include "llama.h"
-#include <unordered_map>
-#include <vector>
-#include <cassert>
-#include <climits>
-#include <cstring>
-#include <cstdarg>
-#include <ctime>
-#include <random>
-#include <stdexcept>
-#include <algorithm>
-#include <string>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-struct my_llama_hparams {
-    uint32_t n_vocab = 32000;
-    uint32_t n_ctx   = 512;
-    uint32_t n_embd  = 4096;
-    uint32_t n_head  = 32;
-    uint32_t n_layer = 32;
-    uint32_t n_rot   = 64;
-    uint32_t n_ff    = 11008;
-
-    // float f_norm_eps     = 1e-5f; // falcon
-    float f_norm_rms_eps = 1e-5f; // llama
-
-    float rope_freq_base  = 10000.0f;
-    float rope_freq_scale = 1.0f;
-};
-
-struct my_llama_layer {
-    // normalization
-    struct ggml_tensor * attention_norm;
-
-    // attention
-    struct ggml_tensor * wq;
-    struct ggml_tensor * wk;
-    struct ggml_tensor * wv;
-    struct ggml_tensor * wo;
-
-    // normalization
-    struct ggml_tensor * ffn_norm;
-
-    // ff
-    struct ggml_tensor * ffn_gate; // w1
-    struct ggml_tensor * ffn_down; // w2
-    struct ggml_tensor * ffn_up;   // w3
-};
-
-struct my_llama_model {
-    struct ggml_context * ctx = NULL;
-    ggml_backend_buffer_t data = NULL;
-
-    my_llama_hparams hparams;
-
-    struct ggml_tensor * tok_embeddings;
-
-    struct ggml_tensor * norm;
-    struct ggml_tensor * output;
-
-    std::vector<my_llama_layer> layers;
-};
-
-// gguf constants (sync with gguf.py)
-static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL     = "train_model";
-static const char * LLM_KV_TRAINING_TYPE                 = "training.type";
-
-static const char * LLM_KV_GENERAL_NAME                = "general.name";
-static const char * LLM_KV_GENERAL_ARCHITECTURE        = "general.architecture";
-static const char * LLM_KV_GENERAL_FILE_TYPE           = "general.file_type";
-
-static const char * LLM_KV_CONTEXT_LENGTH              = "%s.context_length";
-static const char * LLM_KV_EMBEDDING_LENGTH            = "%s.embedding_length";
-static const char * LLM_KV_BLOCK_COUNT                 = "%s.block_count";
-static const char * LLM_KV_FEED_FORWARD_LENGTH         = "%s.feed_forward_length";
-static const char * LLM_KV_ATTENTION_HEAD_COUNT        = "%s.attention.head_count";
-static const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon";
-static const char * LLM_KV_ROPE_DIMENSION_COUNT        = "%s.rope.dimension_count";
-static const char * LLM_KV_ROPE_FREQ_BASE              = "%s.rope.freq_base"; // TODO load in llama.cpp
-static const char * LLM_KV_ROPE_SCALE_LINEAR           = "%s.rope.scale_linear";
-
-static const char * LLM_KV_TOKENIZER_MODEL             = "tokenizer.ggml.model";
-static const char * LLM_KV_TOKENIZER_LIST              = "tokenizer.ggml.tokens";
-static const char * LLM_KV_TOKENIZER_TOKEN_TYPE        = "tokenizer.ggml.token_type";
-static const char * LLM_KV_TOKENIZER_SCORES            = "tokenizer.ggml.scores";
-static const char * LLM_KV_TOKENIZER_MERGES            = "tokenizer.ggml.merges";
-static const char * LLM_KV_TOKENIZER_BOS_ID            = "tokenizer.ggml.bos_token_id";
-static const char * LLM_KV_TOKENIZER_EOS_ID            = "tokenizer.ggml.eos_token_id";
-static const char * LLM_KV_TOKENIZER_UNK_ID            = "tokenizer.ggml.unknown_token_id";
-static const char * LLM_KV_TOKENIZER_SEP_ID            = "tokenizer.ggml.seperator_token_id";
-static const char * LLM_KV_TOKENIZER_PAD_ID            = "tokenizer.ggml.padding_token_id";
-
-static const char * LLM_TENSOR_TOKEN_EMBD    = "token_embd";
-static const char * LLM_TENSOR_OUTPUT_NORM   = "output_norm";
-static const char * LLM_TENSOR_OUTPUT        = "output";
-static const char * LLM_TENSOR_ATTN_NORM     = "blk.%d.attn_norm";
-static const char * LLM_TENSOR_ATTN_Q        = "blk.%d.attn_q";
-static const char * LLM_TENSOR_ATTN_K        = "blk.%d.attn_k";
-static const char * LLM_TENSOR_ATTN_V        = "blk.%d.attn_v";
-static const char * LLM_TENSOR_ATTN_OUT      = "blk.%d.attn_output";
-static const char * LLM_TENSOR_FFN_NORM      = "blk.%d.ffn_norm";
-static const char * LLM_TENSOR_FFN_GATE      = "blk.%d.ffn_gate";
-static const char * LLM_TENSOR_FFN_DOWN      = "blk.%d.ffn_down";
-static const char * LLM_TENSOR_FFN_UP        = "blk.%d.ffn_up";
-
-static void print_params(struct my_llama_hparams * params) {
-    printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
-    printf("%s: n_ctx:   %u\n", __func__, params->n_ctx);
-    printf("%s: n_embd:  %u\n", __func__, params->n_embd);
-    printf("%s: n_head:  %u\n", __func__, params->n_head);
-    printf("%s: n_ff:    %u\n", __func__, params->n_ff);
-    printf("%s: n_layer: %u\n", __func__, params->n_layer);
-    printf("%s: n_rot:   %u\n", __func__, params->n_rot);
-}
-
-static void set_param_model(struct my_llama_model * model) {
-    const auto& hparams = model->hparams;
-
-    const uint32_t n_layer = hparams.n_layer;
-
-    struct ggml_context* ctx = model->ctx;
-
-    ggml_set_param(ctx, model->tok_embeddings);
-    ggml_set_param(ctx, model->norm);
-    ggml_set_param(ctx, model->output);
-
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        ggml_set_param(ctx, layer.attention_norm);
-        ggml_set_param(ctx, layer.wq);
-        ggml_set_param(ctx, layer.wk);
-        ggml_set_param(ctx, layer.wv);
-        ggml_set_param(ctx, layer.wo);
-        ggml_set_param(ctx, layer.ffn_norm);
-        ggml_set_param(ctx, layer.ffn_gate);
-        ggml_set_param(ctx, layer.ffn_down);
-        ggml_set_param(ctx, layer.ffn_up);
-    }
-}
-
-static void init_model(struct my_llama_model * model) {
-    const auto & hparams = model->hparams;
-
-    const uint32_t n_embd  = hparams.n_embd;
-    const uint32_t n_layer = hparams.n_layer;
-    const uint32_t n_vocab = hparams.n_vocab;
-    const uint32_t n_ff    = hparams.n_ff;
-
-
-    std::vector<char> tn_buf;
-    tn_buf.resize(GGML_MAX_NAME);
-    auto tn = [&tn_buf](const char * key) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key);
-        return tn_buf.data();
-    };
-    auto tni = [&tn_buf](const char * key, int bid) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), key, bid);
-        std::string s = tn_buf.data();
-        snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str());
-        return tn_buf.data();
-    };
-
-    // context for model tensors without their data
-    struct ggml_init_params ctx_model_params;
-    ctx_model_params.mem_size   = ggml_tensor_overhead()*2*(6 + n_layer*18);
-    ctx_model_params.mem_buffer = NULL;
-    ctx_model_params.no_alloc   = true;
-
-    struct ggml_context * ctx = ggml_init(ctx_model_params);
-    model->ctx = ctx;
-
-    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-    model->output         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-
-    ggml_set_name(model->tok_embeddings, tn(LLM_TENSOR_TOKEN_EMBD));
-    ggml_set_name(model->norm,           tn(LLM_TENSOR_OUTPUT_NORM));
-    ggml_set_name(model->output,         tn(LLM_TENSOR_OUTPUT));
-
-    model->layers.resize(n_layer);
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
-        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
-        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
-        layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
-
-        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        layer.ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff);
-        layer.ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,   n_ff, n_embd);
-        layer.ffn_up   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff);
-
-        ggml_set_name(layer.attention_norm, tni(LLM_TENSOR_ATTN_NORM, i));
-
-        ggml_set_name(layer.wq,             tni(LLM_TENSOR_ATTN_Q, i));
-        ggml_set_name(layer.wk,             tni(LLM_TENSOR_ATTN_K, i));
-        ggml_set_name(layer.wv,             tni(LLM_TENSOR_ATTN_V, i));
-        ggml_set_name(layer.wo,             tni(LLM_TENSOR_ATTN_OUT, i));
-
-        ggml_set_name(layer.ffn_norm,       tni(LLM_TENSOR_FFN_NORM, i));
-
-        ggml_set_name(layer.ffn_gate,       tni(LLM_TENSOR_FFN_GATE, i));
-        ggml_set_name(layer.ffn_down,       tni(LLM_TENSOR_FFN_DOWN, i));
-        ggml_set_name(layer.ffn_up,         tni(LLM_TENSOR_FFN_UP, i));
-    }
-
-    set_param_model(model);
-
-    // allocate data
-    model->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
-}
-
-static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) {
-    const auto & hparams = model->hparams;
-
-    const uint32_t n_layer = hparams.n_layer;
-
-    struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
-
-    randomize_tensor_normal(model->tok_embeddings, rnd);
-    randomize_tensor_normal(model->norm,           rnd);
-    randomize_tensor_normal(model->output,         rnd);
-
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-        randomize_tensor_normal(layer.attention_norm, rnd);
-
-        randomize_tensor_normal(layer.wq, rnd);
-        randomize_tensor_normal(layer.wk, rnd);
-        randomize_tensor_normal(layer.wv, rnd);
-        randomize_tensor_normal(layer.wo, rnd);
-
-        randomize_tensor_normal(layer.ffn_norm, rnd);
-
-        randomize_tensor_normal(layer.ffn_gate, rnd);
-        randomize_tensor_normal(layer.ffn_down, rnd);
-        randomize_tensor_normal(layer.ffn_up,   rnd);
-    }
-
-    free_random_normal_distribution(rnd);
-}
-
-static struct ggml_tensor * llama_build_train_graphs(
-        struct my_llama_model * model,
-        ggml_gallocr_t          alloc,
-        struct ggml_context   * ctx,
-        struct ggml_cgraph    * gf,
-        struct ggml_cgraph    * gb,
-        struct ggml_cgraph    * gb_tmp,
-        struct ggml_tensor  * * logits,
-        struct ggml_tensor    * tokens_input,
-        struct ggml_tensor    * targets,
-        const  int              n_tokens,
-        const  int              n_batch,
-        const  bool             enable_flash_attn,
-        const  bool             enable_checkpointing,
-        const  bool             measure_only) {
-
-    ggml_set_scratch(ctx, { 0, 0, nullptr, });
-    const int n_past = 0;
-    const int N = n_tokens;
-    const auto & hparams = model->hparams;
-    const int n_ctx      = hparams.n_ctx;
-    const int n_vocab    = hparams.n_vocab;
-    const int n_embd     = hparams.n_embd;
-    const int n_layer    = hparams.n_layer;
-    const int n_head     = hparams.n_head;
-    const int n_rot      = hparams.n_rot;
-    const int n_ff       = hparams.n_ff;
-    const float f_norm_rms_eps  = hparams.f_norm_rms_eps;
-    const float rope_freq_base  = hparams.rope_freq_base;
-    const float rope_freq_scale = hparams.rope_freq_scale;
-
-    auto set_name = [](struct ggml_tensor * t, const char * n) {
-        ggml_set_name(t, n);
-        if (t->grad) {
-            ggml_format_name(t->grad, "%s->grad", n);
-        }
-    };
-
-    // KQ_pos - contains the positions
-    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
-    ggml_set_input(KQ_pos);
-
-    // rope has so much parameters that we make a custom function for it
-    auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
-                (struct ggml_tensor * t) -> struct ggml_tensor * {
-        // not capturing these, to silcence warnings
-        const int rope_mode = 0;
-
-        return ggml_rope_ext(
-            ctx, t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
-        );
-    };
-
-    set_name(tokens_input, "tokens_input");
-    set_name(targets,      "targets");
-
-    GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
-    struct ggml_tensor * t00 = ggml_reshape_1d(ctx, tokens_input, N*n_batch);  set_name(t00, "t00"); assert_shape_1d(t00, N*n_batch);
-    struct ggml_tensor * t01 = ggml_get_rows(ctx, model->tok_embeddings, t00); set_name(t01, "t01"); assert_shape_2d(t01, n_embd, N*n_batch);
-
-    struct ggml_tensor * cur = t01;
-
-    std::vector<struct ggml_tensor *> checkpoints;
-    checkpoints.push_back(tokens_input);
-    checkpoints.push_back(targets);
-    checkpoints.push_back(t00);
-    checkpoints.push_back(t01);
-
-    const float kv_scale = 1.0f/sqrtf(float(n_embd)/n_head);
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct my_llama_layer & layer = model->layers[il];
-        struct ggml_tensor * t02 = ggml_rms_norm     (ctx, cur, f_norm_rms_eps);                    set_name(t02, "t02");     assert_shape_2d(t02, n_embd, N*n_batch);
-        struct ggml_tensor * t03 = ggml_repeat       (ctx, layer.attention_norm, t02);              set_name(t03, "t03");     assert_shape_2d(t03, n_embd, N*n_batch);
-        struct ggml_tensor * t04 = ggml_mul          (ctx, t03, t02);                               set_name(t04, "t04");     assert_shape_2d(t04, n_embd, N*n_batch);
-        struct ggml_tensor * t05 = ggml_mul_mat      (ctx, layer.wq, t04);                          set_name(t05, "t05");     assert_shape_2d(t05, n_embd, N*n_batch);
-        struct ggml_tensor * t06 = ggml_reshape_4d   (ctx, t05, n_embd/n_head, n_head, N, n_batch); set_name(t06, "t06");     assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t07 = rope              (t06);                                         set_name(t07, "t07");     assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t08 = ggml_mul_mat      (ctx, layer.wk, t04);                          set_name(t08, "t08");     assert_shape_2d(t08, n_embd, N*n_batch);
-        struct ggml_tensor * t09 = ggml_reshape_4d   (ctx, t08, n_embd/n_head, n_head, N, n_batch); set_name(t09, "t09");     assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t10 = rope              (t09);                                         set_name(t10, "t10");     assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t11 = ggml_mul_mat      (ctx, t04, layer.wv);                          set_name(t11, "t11");     assert_shape_2d(t11, N*n_batch, n_embd);
-        struct ggml_tensor * t12 = ggml_reshape_4d   (ctx, t11, N, n_batch, n_embd/n_head, n_head); set_name(t12, "t12");     assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head);
-        struct ggml_tensor * t13 = ggml_permute      (ctx, t07, 0, 2, 1, 3);                        set_name(t13, "t13");     assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch);
-        struct ggml_tensor * t14 = ggml_permute      (ctx, t10, 0, 2, 1, 3);                        set_name(t14, "t14");     assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch);
-        struct ggml_tensor * t15 = ggml_permute      (ctx, t12, 0, 3, 1, 2);                        set_name(t15, "t15");     assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch);
-        struct ggml_tensor * t16;
-        if (enable_flash_attn) {
-            GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
-            //t16 = ggml_flash_attn(ctx, t13, t14, t15, true);                                        set_name(t16, "t16");     assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch);
-        } else {
-            struct ggml_tensor * t16_0 = ggml_mul_mat              (ctx, t14, t13);                 set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
-            struct ggml_tensor * t16_1 = ggml_scale_inplace        (ctx, t16_0, kv_scale);          set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
-            struct ggml_tensor * t16_2 = ggml_diag_mask_inf_inplace(ctx, t16_1, n_past);            set_name(t16_2, "t16_2"); assert_shape_4d(t16_2, N, N, n_head, n_batch);
-            struct ggml_tensor * t16_3 = ggml_soft_max_inplace     (ctx, t16_2);                    set_name(t16_3, "t16_3"); assert_shape_4d(t16_3, N, N, n_head, n_batch);
-            t16 = ggml_mul_mat(ctx, t15, t16_3);                                                    set_name(t16, "t16");     assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch);
-        }
-        struct ggml_tensor * t17 = ggml_permute      (ctx, t16, 0, 2, 1, 3);                        set_name(t17, "t17");     assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t18 = ggml_cont         (ctx, t17);                                    set_name(t18, "t18");     assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t19 = ggml_reshape_2d   (ctx, t18, n_embd, N*n_batch);                 set_name(t19, "t19");     assert_shape_2d(t19, n_embd, N*n_batch);
-        struct ggml_tensor * t20 = ggml_mul_mat      (ctx, layer.wo, t19);                          set_name(t20, "t20");     assert_shape_2d(t20, n_embd, N*n_batch);
-        struct ggml_tensor * t21 = ggml_add          (ctx, t20, cur);                               set_name(t21, "t21");     assert_shape_2d(t21, n_embd, N*n_batch);
-        struct ggml_tensor * t22 = ggml_rms_norm     (ctx, t21, f_norm_rms_eps);                    set_name(t22, "t22");     assert_shape_2d(t22, n_embd, N*n_batch);
-        struct ggml_tensor * t23 = ggml_repeat       (ctx, layer.ffn_norm, t22);                    set_name(t23, "t23");     assert_shape_2d(t23, n_embd, N*n_batch);
-        struct ggml_tensor * t24 = ggml_mul          (ctx, t23, t22);                               set_name(t24, "t24");     assert_shape_2d(t24, n_embd, N*n_batch);
-        struct ggml_tensor * t25 = ggml_mul_mat      (ctx, layer.ffn_up, t24);                      set_name(t25, "t25");     assert_shape_2d(t25, n_ff, N*n_batch);
-        struct ggml_tensor * t26 = ggml_mul_mat      (ctx, layer.ffn_gate, t24);                    set_name(t26, "t26");     assert_shape_2d(t26, n_ff, N*n_batch);
-        struct ggml_tensor * t27 = ggml_silu         (ctx, t26);                                    set_name(t27, "t27");     assert_shape_2d(t27, n_ff, N*n_batch);
-        struct ggml_tensor * t28 = ggml_mul          (ctx, t27, t25);                               set_name(t28, "t28");     assert_shape_2d(t28, n_ff, N*n_batch);
-        struct ggml_tensor * t29 = ggml_mul_mat      (ctx, layer.ffn_down, t28);                    set_name(t29, "t29");     assert_shape_2d(t29, n_embd, N*n_batch);
-        struct ggml_tensor * t30 = ggml_add          (ctx, t29, t21);                               set_name(t30, "t30");     assert_shape_2d(t30, n_embd, N*n_batch);
-        cur = t30;
-        checkpoints.push_back(cur);
-    }
-    struct ggml_tensor * t31   = ggml_rms_norm          (ctx, cur, f_norm_rms_eps);                 set_name(t31, "t31");     assert_shape_2d(t31, n_embd, N*n_batch);
-    struct ggml_tensor * t32   = ggml_repeat            (ctx, model->norm, t31);                    set_name(t32, "t32");     assert_shape_2d(t32, n_embd, N*n_batch);
-    struct ggml_tensor * t33   = ggml_mul               (ctx, t32, t31);                            set_name(t33, "t33");     assert_shape_2d(t33, n_embd, N*n_batch);
-    struct ggml_tensor * t34   = ggml_mul_mat           (ctx, model->output, t33);                  set_name(t34, "t34");     assert_shape_2d(t34, n_vocab, N*n_batch);
-    struct ggml_tensor * t35   = ggml_reshape_3d        (ctx, t34, n_vocab, N, n_batch);            set_name(t35, "t35");     assert_shape_3d(t35, n_vocab, N, n_batch);
-    struct ggml_tensor * t36   = ggml_cross_entropy_loss(ctx, t35, targets);                        set_name(t36, "t36");     assert_shape_1d(t36, 1);
-
-    checkpoints.push_back(t31);
-    checkpoints.push_back(t32);
-    checkpoints.push_back(t33);
-    checkpoints.push_back(t34);
-    checkpoints.push_back(t35);
-    checkpoints.push_back(t36);
-
-    ggml_build_forward_expand(gf, t36);
-
-    if (enable_checkpointing) {
-        ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size());
-    } else {
-        ggml_graph_cpy(gf, gb);
-        ggml_build_backward_expand(ctx, gf, gb, true);
-    }
-
-    if (alloc) {
-        // make sure some tensors are not reallocated by inserting new temporary nodes depending on them
-        int n_leafs_before = gb->n_leafs;
-        int n_nodes_before = gb->n_nodes;
-        // output tensors
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, 1.0f));
-        // input gradient
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f));
-        // KQ_pos
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));
-        GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
-        ggml_set_input(t36->grad);
-
-        // allocating checkpoints in one block to reduce memory fragmentation
-        // note: they will be freed in reverse order
-        for (int i = 0; i < (int) checkpoints.size(); ++i) {
-            if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) {
-                ggml_set_input(checkpoints[i]);
-            }
-        }
-
-        //int n_leafs_after = gb->n_leafs;
-        //int n_nodes_after = gb->n_nodes;
-        if (measure_only) {
-            // FIXME: will still allocate
-            ggml_gallocr_reserve(alloc, gb);
-        } else {
-            ggml_gallocr_alloc_graph(alloc, gb);
-
-            if (!measure_only) {
-                int * data = (int *) KQ_pos->data;
-                for (int i = 0; i < N; ++i) {
-                    data[i] = n_past + i;
-                }
-            }
-        }
-
-        // remove the additional nodes and leafs
-        for (int i = n_leafs_before; i < gb->n_leafs; ++i) {
-            gb->leafs[i] = NULL;
-        }
-        for (int i = n_nodes_before; i < gb->n_nodes; ++i) {
-            gb->nodes[i] = NULL;
-        }
-        gb->n_leafs = n_leafs_before;
-        gb->n_nodes = n_nodes_before;
-    }
-
-    *logits = t35;
-    return t36;
-}
-
-#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
-do { \
-    const std::string skey(key); \
-    const int kid = gguf_find_key(ctx, skey.c_str()); \
-    if (kid >= 0) { \
-        enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
-        if (ktype != (type)) { \
-            die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \
-        } \
-        (dst) = func(ctx, kid); \
-    } else if (req) { \
-        die_fmt("key not found in model: %s", skey.c_str()); \
-    } \
-} while (0)
-
-static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model) {
-    // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read
-    std::string arch;
-
-    std::vector<char> keybuf;
-    keybuf.resize(512);
-    auto kv = [&arch, &keybuf](const char * key) -> const char * {
-        snprintf(keybuf.data(), keybuf.size(), key, arch.c_str());
-        return keybuf.data();
-    };
-
-    std::vector<char> tn_buf;
-    tn_buf.resize(GGML_MAX_NAME);
-    auto tn = [&tn_buf](const char * key) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key);
-        return tn_buf.data();
-    };
-    auto tni = [&tn_buf](const char * key, int bid) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), key, bid);
-        std::string s = tn_buf.data();
-        snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str());
-        return tn_buf.data();
-    };
-
-    GGUF_GET_KEY(fctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE);
-    GGML_ASSERT(arch == "llama");
-
-    uint32_t ftype_u;
-    GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_GENERAL_FILE_TYPE);
-    GGML_ASSERT((enum llama_ftype) ftype_u == LLAMA_FTYPE_ALL_F32);
-
-    // n_ctx was not saved in earlier checkpoint file versions, so we make it optional here
-    GGUF_GET_KEY(fctx, model->hparams.n_ctx,   gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH));
-
-    GGUF_GET_KEY(fctx, model->hparams.n_embd,  gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
-    GGUF_GET_KEY(fctx, model->hparams.n_ff,    gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
-    GGUF_GET_KEY(fctx, model->hparams.n_head,  gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
-    GGUF_GET_KEY(fctx, model->hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
-
-    model->hparams.n_rot = model->hparams.n_embd / model->hparams.n_head;
-    GGUF_GET_KEY(fctx, model->hparams.n_rot,   gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
-
-    float rope_freq_scale = 1.0f;
-    GGUF_GET_KEY(fctx, model->hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
-    GGUF_GET_KEY(fctx, model->hparams.rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
-    GGUF_GET_KEY(fctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
-    if (rope_freq_scale != 1.0f) {
-        model->hparams.rope_freq_scale = 1.0f / rope_freq_scale;
-    }
-
-    init_model(model);
-
-    copy_tensor_by_name(model->tok_embeddings, f_ggml_ctx, tn(LLM_TENSOR_TOKEN_EMBD));
-    copy_tensor_by_name(model->norm,           f_ggml_ctx, tn(LLM_TENSOR_OUTPUT_NORM));
-    copy_tensor_by_name(model->output,         f_ggml_ctx, tn(LLM_TENSOR_OUTPUT));
-
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        copy_tensor_by_name(layer.attention_norm, f_ggml_ctx, tni(LLM_TENSOR_ATTN_NORM, i));
-        copy_tensor_by_name(layer.wq,             f_ggml_ctx, tni(LLM_TENSOR_ATTN_Q, i));
-        copy_tensor_by_name(layer.wk,             f_ggml_ctx, tni(LLM_TENSOR_ATTN_K, i));
-        copy_tensor_by_name(layer.wv,             f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i));
-        copy_tensor_by_name(layer.wo,             f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i));
-        copy_tensor_by_name(layer.ffn_norm,       f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i));
-        copy_tensor_by_name(layer.ffn_gate,       f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i));
-        copy_tensor_by_name(layer.ffn_down,       f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i));
-        copy_tensor_by_name(layer.ffn_up,         f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i));
-    }
-}
-
-static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model) {
-    const char * arch = "llama";
-
-    enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
-
-    std::vector<char> keybuf;
-    keybuf.resize(512);
-    auto kv = [arch, &keybuf](const char * key) -> const char * {
-        snprintf(keybuf.data(), keybuf.size(), key, arch);
-        return keybuf.data();
-    };
-
-    // set arch
-    gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch);
-    gguf_set_val_str(fctx, LLM_KV_GENERAL_NAME, arch);
-    gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype);
-
-    // set hparams
-    gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH),              model->hparams.n_ctx                  );
-    gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH),            model->hparams.n_embd                 );
-    gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH),         model->hparams.n_ff                   );
-    gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT),        model->hparams.n_head                 );
-    gguf_set_val_u32(fctx, kv(LLM_KV_BLOCK_COUNT),                 model->hparams.n_layer                );
-    gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_DIMENSION_COUNT),        model->hparams.n_rot                  );
-
-    gguf_set_val_f32(fctx, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS), model->hparams.f_norm_rms_eps         );
-    gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_FREQ_BASE),              model->hparams.rope_freq_base         ); // TODO load in llama.cpp
-    gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_SCALE_LINEAR),           1.0f / model->hparams.rope_freq_scale );
-
-    // set vocab by copying from vocab_model gguf file
-    {
-        struct gguf_init_params params = {
-            /*.no_alloc = */ false,
-            /*.ctx      = */ NULL,
-        };
-        struct gguf_context * vctx = gguf_init_from_file(fn_vocab_model, params);
-
-        const int token_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_LIST));
-        if (token_idx == -1) {
-            die("cannot find tokenizer vocab in model file");
-        }
-        const uint32_t n_vocab = gguf_get_arr_n(vctx, token_idx);
-
-        const int score_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_SCORES));
-        if (score_idx == -1) {
-            die("cannot find tokenizer scores in model file");
-        }
-
-        const float * scores = (const float * ) gguf_get_arr_data(vctx, score_idx);
-
-        const int toktype_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE));
-        if (toktype_idx == -1) {
-            die("cannot find token type list in GGUF file");
-        }
-
-        const int * toktypes = (const int * ) gguf_get_arr_data(vctx, toktype_idx);
-
-        std::string tokenizer_name;
-        GGUF_GET_KEY(vctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_TOKENIZER_MODEL));
-
-        gguf_set_val_str(fctx, kv(LLM_KV_TOKENIZER_MODEL), tokenizer_name.c_str());
-        gguf_set_arr_data(fctx, kv(LLM_KV_TOKENIZER_SCORES), GGUF_TYPE_FLOAT32, scores, n_vocab);
-        gguf_set_arr_data(fctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE), GGUF_TYPE_INT32, toktypes, n_vocab);
-
-        int32_t special_bos_id = 1;
-        int32_t special_eos_id = 2;
-        int32_t special_unk_id = 0;
-        int32_t special_sep_id = -1;
-        int32_t special_pad_id = -1;
-        if (tokenizer_name == "llama") {
-            // default special tokens
-            special_bos_id = 1;
-            special_eos_id = 2;
-            special_unk_id = 0;
-            special_sep_id = -1;
-            special_pad_id = -1;
-        } else if (tokenizer_name == "gpt2") {
-            // read and copy bpe merges
-            const int merges_keyidx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_MERGES));
-            if (merges_keyidx == -1) {
-                die("cannot find tokenizer merges in model file");
-            }
-
-            const int n_merges = gguf_get_arr_n(vctx, merges_keyidx);
-
-            std::vector<const char*> merges;
-            merges.resize(n_merges);
-            for (int i = 0; i < n_merges; i++) {
-                merges[i] = gguf_get_arr_str(vctx, merges_keyidx, i);
-            }
-            gguf_set_arr_str(fctx, kv(LLM_KV_TOKENIZER_MERGES), merges.data(), n_merges);
-
-            // default special tokens
-            special_bos_id = 11;
-            special_eos_id = 11;
-            special_unk_id = -1;
-            special_sep_id = -1;
-            special_pad_id = -1;
-        } else {
-            fprintf(stderr, "%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
-            fprintf(stderr, "%s: using default tokenizer: 'llama'", __func__);
-        }
-
-        std::vector<const char*> tokens;
-        tokens.resize(n_vocab);
-        for (uint32_t i = 0; i < n_vocab; i++) {
-            tokens[i] = gguf_get_arr_str(vctx, token_idx, i);
-        }
-        gguf_set_arr_str(fctx, kv(LLM_KV_TOKENIZER_LIST), tokens.data(), n_vocab);
-
-        GGUF_GET_KEY(vctx, special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
-        GGUF_GET_KEY(vctx, special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_EOS_ID));
-        GGUF_GET_KEY(vctx, special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID));
-        GGUF_GET_KEY(vctx, special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID));
-        GGUF_GET_KEY(vctx, special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID));
-
-        gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_BOS_ID), special_bos_id);
-        gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_EOS_ID), special_eos_id);
-        gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_UNK_ID), special_unk_id);
-        gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_SEP_ID), special_sep_id);
-        gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_PAD_ID), special_pad_id);
-
-        gguf_free(vctx);
-    }
-
-    // add tensors
-    gguf_add_tensor(fctx, model->tok_embeddings);
-    gguf_add_tensor(fctx, model->norm);
-    gguf_add_tensor(fctx, model->output);
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-
-        gguf_add_tensor(fctx, layer.attention_norm);
-        gguf_add_tensor(fctx, layer.wq);
-        gguf_add_tensor(fctx, layer.wk);
-        gguf_add_tensor(fctx, layer.wv);
-        gguf_add_tensor(fctx, layer.wo);
-        gguf_add_tensor(fctx, layer.ffn_norm);
-        gguf_add_tensor(fctx, layer.ffn_gate);
-        gguf_add_tensor(fctx, layer.ffn_down);
-        gguf_add_tensor(fctx, layer.ffn_up);
-    }
-}
-
-static void save_llama_model_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model) {
-    printf("%s: saving to %s\n", __func__, filename);
-    struct gguf_context * fctx = gguf_init_empty();
-
-    save_llama_model_gguf(fctx, fn_vocab_model, model);
-
-    // write file
-    const bool only_meta = false;
-    gguf_write_to_file(fctx, filename, only_meta);
-    gguf_free(fctx);
-}
-
-static void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct train_state * train) {
-    load_llama_model_gguf(fctx, f_ggml_ctx, model);
-    if (load_train_state_gguf(fctx, f_ggml_ctx, train)) {
-        std::string train_type = LLM_KV_TRAINING_TYPE_TRAIN_MODEL;
-        GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE);
-        GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_TRAIN_MODEL);
-    } else {
-        printf("%s: loaded llama model as checkpoint\n", __func__);
-    }
-}
-
-static void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model, struct train_state * train) {
-    gguf_set_val_str(fctx, LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_TRAIN_MODEL);
-    save_llama_model_gguf(fctx, fn_vocab_model, model);
-    save_train_state_gguf(fctx, train);
-}
-
-static bool load_checkpoint_file(const char * filename, struct my_llama_model * model, struct train_state * train) {
-    struct ggml_context * f_ggml_ctx;
-    struct gguf_init_params params;
-    params.no_alloc = false;
-    params.ctx = &f_ggml_ctx;
-    struct gguf_context * fctx = gguf_init_from_file(filename, params);
-    if (fctx == NULL) {
-        return false;
-    }
-
-    load_checkpoint_gguf(fctx, f_ggml_ctx, model, train);
-
-    gguf_free(fctx);
-    return true;
-}
-
-static void save_checkpoint_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model, struct train_state * train) {
-    printf("%s: saving to %s\n", __func__, filename);
-    struct gguf_context * fctx = gguf_init_empty();
-
-    save_checkpoint_gguf(fctx, fn_vocab_model, model, train);
-
-    // write file
-    const bool only_meta = false;
-    gguf_write_to_file(fctx, filename, only_meta);
-    gguf_free(fctx);
-}
-
-struct train_params {
-    struct train_params_common common;
-
-    const char * fn_vocab_model;
-    const char * fn_model_out;
-
-    bool only_write_model;
-
-    int n_ctx;
-    int n_embd;
-    int n_head;
-    int n_layer;
-    int n_ff;
-
-    float f_norm_rms_eps;
-    float rope_freq_base;
-    float rope_freq_scale;
-};
-
-static struct train_params get_default_train_params() {
-    struct train_params params;
-    params.common = get_default_train_params_common();
-    params.fn_vocab_model    = "ggml-vic7b-uncensored-q4_0.bin";
-    params.fn_model_out      = "ggml-checkpoint-f32.bin";
-
-    params.only_write_model = false;
-
-    params.n_ctx      =  128;
-    params.n_embd     =  256;
-    params.n_head     =    8;
-    params.n_layer    =   16;
-    params.n_ff       =  768;
-
-    params.f_norm_rms_eps  = 1e-5f;
-    params.rope_freq_base  = 10000.0f;
-    params.rope_freq_scale = 1.0f;
-
-    return params;
-}
-
-static void train_print_usage(int argc, char ** argv, const struct train_params * params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help                 show this help message and exit\n");
-
-    fprintf(stderr, "  --vocab-model FNAME        model path from which to load vocab (default '%s')\n", params->fn_vocab_model);
-    fprintf(stderr, "  --model-out FNAME          path to save ggml model (default '%s')\n", params->fn_model_out);
-    fprintf(stderr, "  --only-write-model         only save llama model, don't do any training. use this if you only want to convert a checkpoint to a model.\n");
-    fprintf(stderr, "  --embd N                   Embedding size used for new models (default %d)\n", params->n_embd);
-    fprintf(stderr, "  --ff N                     Feedforward size used for new models. (default %d)\n", params->n_ff);
-    fprintf(stderr, "  --head N                   Number of heads for new models (default %d)\n", params->n_head);
-    fprintf(stderr, "  --layer N                  Number of layers for new models (default %d)\n", params->n_layer);
-    fprintf(stderr, "  --norm-rms-eps F           RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps);
-    fprintf(stderr, "  --rope-freq-base F         Frequency base for ROPE (default %f)\n", params->rope_freq_base);
-    fprintf(stderr, "  --rope-freq-scale F        Frequency scale for ROPE (default %f)\n", params->rope_freq_scale);
-
-    print_common_train_usage(argc, argv, &params->common);
-}
-
-static bool train_params_parse(int argc, char ** argv, struct train_params * params) {
-    bool invalid_param = false;
-    std::string arg;
-    struct train_params default_params = get_default_train_params();
-    const std::string arg_prefix = "--";
-
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
-            std::replace(arg.begin(), arg.end(), '_', '-');
-        }
-
-        if (consume_common_train_arg(argc, argv, &i, &params->common, &invalid_param)) {
-            if (invalid_param) {
-                break;
-            } else if (params->common.print_usage) {
-                train_print_usage(argc, argv, &default_params);
-                exit(0);
-            }
-        } else if (arg == "--vocab-model") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->fn_vocab_model = argv[i];
-        } else if (arg == "--model-out") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->fn_model_out = argv[i];
-        } else if (arg == "--only-write-model") {
-            params->only_write_model = true;
-        } else if (arg == "--embd") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_embd = std::stoi(argv[i]);
-        } else if (arg == "--ff") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_ff = std::stoi(argv[i]);
-        } else if (arg == "--head") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_head = std::stoi(argv[i]);
-        } else if (arg == "--layer") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_layer = std::stoi(argv[i]);
-        } else if (arg == "--norm-rms-eps") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->f_norm_rms_eps = std::stof(argv[i]);
-        } else if (arg == "--rope-freq-base") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->rope_freq_base = std::stof(argv[i]);
-        } else if (arg == "--rope-freq-scale") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->rope_freq_scale = std::stof(argv[i]);
-        } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            train_print_usage(argc, argv, &default_params);
-            exit(1);
-        }
-    }
-    if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        train_print_usage(argc, argv, &default_params);
-        exit(1);
-    }
-    finish_processing_train_args(&params->common);
-
-    return true;
-}
-
-struct save_train_files_data {
-    const char            * fn_checkpoint_out;
-    const char            * fn_model_out;
-    const char            * fn_vocab_model;
-    const char            * pattern_fn_it;
-    const char            * fn_latest;
-    struct my_llama_model * model;
-};
-
-static void save_train_files(void * vdata, struct train_state * train) {
-    struct save_train_files_data * data   = (struct save_train_files_data *) vdata;
-    int64_t iter = train->opt->iter;
-
-    if (strlen(data->fn_checkpoint_out) > 0) {
-        save_checkpoint_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->fn_vocab_model, data->model, train);
-        save_checkpoint_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, -1  ).c_str(), data->fn_vocab_model, data->model, train);
-
-    }
-    if (strlen(data->fn_model_out) > 0) {
-        save_llama_model_file(get_train_filename(data->fn_model_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->fn_vocab_model, data->model);
-        save_llama_model_file(get_train_filename(data->fn_model_out, data->pattern_fn_it, data->fn_latest, -1  ).c_str(), data->fn_vocab_model, data->model);
-    }
-}
-
-static int64_t get_parameter_count(struct my_llama_model* model) {
-    int64_t nx = 0;
-    nx += ggml_nelements(model->tok_embeddings);
-    nx += ggml_nelements(model->norm);
-    nx += ggml_nelements(model->output);
-
-    for (uint32_t i = 0; i < model->layers.size(); ++i) {
-        auto & layer = model->layers[i];
-        nx += ggml_nelements(layer.attention_norm);
-        nx += ggml_nelements(layer.wq);
-        nx += ggml_nelements(layer.wk);
-        nx += ggml_nelements(layer.wv);
-        nx += ggml_nelements(layer.wo);
-        nx += ggml_nelements(layer.ffn_norm);
-        nx += ggml_nelements(layer.ffn_gate);
-        nx += ggml_nelements(layer.ffn_down);
-        nx += ggml_nelements(layer.ffn_up);
-    }
-    return nx;
-}
-
-int main(int argc, char ** argv) {
-    struct train_params params = get_default_train_params();
-
-    if (!train_params_parse(argc, argv, &params)) {
-        return 1;
-    }
-
-    if (params.common.seed == LLAMA_DEFAULT_SEED) {
-        params.common.seed = time(NULL);
-    }
-    printf("%s: seed: %u\n", __func__, params.common.seed);
-    srand(params.common.seed);
-
-    struct llama_model_params mparams = llama_model_default_params();
-    mparams.vocab_only = true;
-
-    struct llama_context_params cparams = llama_context_default_params();
-
-    struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, mparams);
-    struct llama_context * lctx = llama_new_context_with_model(lmodel, cparams);
-
-    struct my_llama_model model;
-    model.hparams.n_vocab = llama_n_vocab(lmodel);
-    model.hparams.n_ctx   = params.common.n_ctx;
-    model.hparams.n_embd  = params.n_embd;
-    model.hparams.n_head  = params.n_head;
-    model.hparams.n_layer = params.n_layer;
-    model.hparams.n_ff    = params.n_ff;
-    // llama.cpp requires n_rot to be exactly n_embd / n_head
-    model.hparams.n_rot   = model.hparams.n_embd / model.hparams.n_head;
-    model.hparams.f_norm_rms_eps  = params.f_norm_rms_eps;
-    model.hparams.rope_freq_base  = params.rope_freq_base;
-    model.hparams.rope_freq_scale = params.rope_freq_scale;
-
-    struct train_state      * train = init_train_state();
-    struct ggml_opt_context * opt   = train->opt;
-
-    // set opt params from command line
-    opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
-    opt->params.print_forward_graph     = false;
-    opt->params.print_backward_graph    = false;
-    opt->params.graph_size              = LLAMA_TRAIN_MAX_NODES;
-    opt->params.n_threads               = params.common.n_threads;
-    opt->params.past                    = params.common.opt_past;
-    opt->params.delta                   = params.common.opt_delta;
-    opt->params.max_no_improvement      = params.common.opt_max_no_improvement;
-    opt->params.n_gradient_accumulation = params.common.n_gradient_accumulation;
-    opt->params.adam.n_iter             = params.common.adam_n_iter;
-    opt->params.adam.sched              = 1.0f;
-    opt->params.adam.alpha              = params.common.adam_alpha;
-    opt->params.adam.decay              = params.common.adam_decay;
-    opt->params.adam.decay_min_ndim     = params.common.adam_decay_min_ndim;
-    opt->params.adam.beta1              = params.common.adam_beta1;
-    opt->params.adam.beta2              = params.common.adam_beta2;
-    opt->params.adam.gclip              = params.common.adam_gclip;
-    opt->params.adam.eps_f              = params.common.adam_eps_f;
-
-    printf("%s: init model\n", __func__);
-    bool existed = load_checkpoint_file(params.common.fn_checkpoint_in, &model, train);
-    if (existed) {
-        // overwrite last n_ctx with user provided n_ctx
-        if (params.common.custom_n_ctx) {
-            model.hparams.n_ctx = params.common.n_ctx;
-        }
-
-        const bool opt_past_changed = opt->params.past != params.common.opt_past;
-
-        if (opt_past_changed) {
-            die("Optimizer parameter '--opt-past N' differs from checkpoint file. To use different value train from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting");
-            // need to discard previous optimizer past function value statistics and opt_init with new shapes
-            // TODO
-        }
-    } else {
-        init_model(&model);
-        randomize_model(&model, params.common.seed, 0.0f, 1.0f, -1.0f, +1.0f);
-        if (!params.only_write_model) {
-            ggml_opt_init(opt->ctx, opt, opt->params, get_parameter_count(&model));
-        }
-    }
-    opt->iter = train->train_its;
-
-    print_params(&model.hparams);
-    printf("%s: total train_iterations %llu\n", __func__, (long long unsigned) train->train_its);
-    printf("%s: seen train_samples     %llu\n", __func__, (long long unsigned) train->train_samples);
-    printf("%s: seen train_tokens      %llu\n", __func__, (long long unsigned) train->train_tokens);
-    printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
-    printf("%s: model_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(model.ctx) + ggml_backend_buffer_get_size(model.data)), (float) (ggml_used_mem(model.ctx) + ggml_backend_buffer_get_size(model.data)) / (1024.0f*1024.0f));
-
-    if (params.only_write_model) {
-        save_train_files_data save_data;
-        save_data.fn_checkpoint_out = "";
-        save_data.fn_model_out      = params.fn_model_out;
-        save_data.fn_vocab_model    = params.fn_vocab_model;
-        save_data.pattern_fn_it     = params.common.pattern_fn_it;
-        save_data.fn_latest         = params.common.fn_latest;
-        save_data.model             = &model;
-
-        save_train_files(&save_data, train);
-
-        free_train_state(train);
-        ggml_free(model.ctx);
-        llama_free(lctx);
-        llama_free_model(lmodel);
-        return 0;
-    }
-
-    printf("%s: opt_size  = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f));
-    printf("%s: opt iter %d\n", __func__, opt->iter);
-
-    int n_tokens = model.hparams.n_ctx;
-    int n_vocab  = model.hparams.n_vocab;
-    int n_batch  = params.common.n_batch;
-
-    // context for input tensors without their data
-    struct ggml_init_params ctx_input_params = {
-        ggml_tensor_overhead() * 2, // mem_size
-        NULL,                       // mem_buffer
-        true,                       // no_alloc
-    };
-    struct ggml_context * ctx_input = ggml_init(ctx_input_params);
-
-    // the input tensors
-    struct ggml_tensor * tokens_input  = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch);
-    struct ggml_tensor * target_probs  = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab,  n_tokens, n_batch);
-
-    // measure required memory for input tensors
-    // allocate input tensors
-    ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type());
-    size_t max_input_size = ggml_backend_buffer_get_size(input_data);
-    printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
-
-    // context for compute tensors without their data
-    const size_t estimated_compute_size_wo_data = (
-            2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() +
-            (params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true))
-    );
-    struct ggml_init_params ctx_compute_params = {
-        estimated_compute_size_wo_data, // mem_size
-        NULL,                           // mem_buffer
-        true,                           // no_alloc
-    };
-    struct ggml_context * ctx_compute = NULL;
-
-    struct ggml_tensor * loss   = NULL;
-    struct ggml_tensor * logits = NULL;
-
-    struct ggml_cgraph * gf     = NULL;
-    struct ggml_cgraph * gb     = NULL;
-    struct ggml_cgraph * gb_tmp = NULL;
-
-    // measure required memory for compute tensors
-    size_t best_compute_size = SIZE_MAX;
-    enum ggml_cgraph_eval_order best_order = GGML_CGRAPH_EVAL_ORDER_COUNT;
-    // find best evaluation order
-    for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
-        ctx_compute = ggml_init(ctx_compute_params);
-        ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
-        gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-        gf->order = (enum ggml_cgraph_eval_order) order;
-        gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-        gb_tmp = params.common.use_checkpointing
-            ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
-            : NULL;
-        loss = llama_build_train_graphs(
-            &model, alloc, ctx_compute,
-            gf, gb, gb_tmp,
-            &logits, tokens_input, target_probs,
-            n_tokens, n_batch,
-            params.common.use_flash,
-            params.common.use_checkpointing,
-            true
-        );
-        size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer
-        if (max_compute_size < best_compute_size) {
-            best_compute_size = max_compute_size;
-            best_order = gf->order;
-        }
-        ggml_free(ctx_compute);
-    }
-    size_t max_compute_size = best_compute_size;
-    printf("%s: compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f));
-    printf("%s: evaluation order = %s\n", __func__,
-        (best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" :
-        (best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" :
-        "invalid");
-
-    // allocate compute tensors
-    ctx_compute = ggml_init(ctx_compute_params);
-    ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
-    gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-    gf->order = best_order;
-    gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-    gb_tmp = params.common.use_checkpointing
-        ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
-        : NULL;
-    loss = llama_build_train_graphs(
-        &model, alloc, ctx_compute,
-        gf, gb, gb_tmp,
-        &logits, tokens_input, target_probs,
-        n_tokens, n_batch,
-        params.common.use_flash,
-        params.common.use_checkpointing,
-        false
-    );
-
-    std::vector<llama_token> train_tokens;
-    std::vector<size_t> train_samples_begin;
-    std::vector<size_t> train_samples_size;
-    printf("%s: tokenize training data\n", __func__);
-    tokenize_file(lctx,
-            params.common.fn_train_data,
-            params.common.sample_start,
-            params.common.include_sample_start,
-            params.common.overlapping_samples,
-            n_tokens,
-            train_tokens,
-            train_samples_begin,
-            train_samples_size);
-    GGML_ASSERT(train_samples_begin.size() == train_samples_size.size());
-
-    printf("%s: number of training tokens: %zu\n", __func__, train_tokens.size());
-
-    size_t shuffle_samples_hash = compute_samples_hash(params.common.fn_train_data, train_samples_begin.data(), train_samples_size.data(), train_samples_size.size());
-    const bool changed_train_data = (shuffle_samples_hash != train->shuffle_samples_hash) || (train->shuffle_sample_count != train_samples_size.size());
-    if (changed_train_data) {
-        printf("%s: train data seems to have changed. restarting shuffled epoch.\n", __func__);
-    }
-    if (params.common.force_reshuffle) {
-        printf("%s: forced reshuffling of data. restarting with newly shuffled epoch.\n", __func__);
-    }
-    if ((train->shuffle_rng_state_current == "") || changed_train_data || params.common.force_reshuffle) {
-        train->shuffle_rng_state_current = mt19937_seed_to_state(params.common.seed);
-        train->shuffle_sample_count = train_samples_size.size();
-        train->shuffle_next_sample = 0;
-        train->shuffle_samples_hash = shuffle_samples_hash;
-    }
-    std::vector<size_t> train_shuffled_samples_offs;
-    std::vector<size_t> train_shuffled_samples_begin;
-    std::vector<size_t> train_shuffled_samples_size;
-    train_shuffled_samples_offs.resize(train_samples_begin.size());
-    train_shuffled_samples_begin.resize(train_samples_begin.size());
-    train_shuffled_samples_size.resize(train_samples_size.size());
-    train->shuffle_rng_state_next = shuffle_samples(
-        train->shuffle_rng_state_current,
-        train_shuffled_samples_offs.data(),
-        train_shuffled_samples_begin.data(),
-        train_shuffled_samples_size.data(),
-        train_samples_begin.data(),
-        train_samples_size.data(),
-        train_samples_size.size());
-    printf("%s: begin training\n", __func__);
-
-    save_train_files_data save_data;
-    save_data.fn_checkpoint_out = params.common.fn_checkpoint_out;
-    save_data.fn_model_out      = params.fn_model_out;
-    save_data.fn_vocab_model    = params.fn_vocab_model;
-    save_data.pattern_fn_it     = params.common.pattern_fn_it;
-    save_data.fn_latest         = params.common.fn_latest;
-    save_data.model             = &model;
-
-    struct train_opt_callback_data opt_cb_data;
-    opt_cb_data.params                 = &params.common;
-    opt_cb_data.train                  = train;
-    opt_cb_data.save_cb                = &save_train_files;
-    opt_cb_data.save_data              = &save_data;
-    opt_cb_data.lctx                   = lctx;
-    opt_cb_data.last_save_iter         = opt->iter;
-    opt_cb_data.tokens_data            = train_tokens.data();
-    opt_cb_data.tokens_size            = train_tokens.size();
-    opt_cb_data.samples_begin          = train_samples_begin.data();
-    opt_cb_data.samples_size           = train_samples_size.data();
-    opt_cb_data.shuffled_samples_offs  = train_shuffled_samples_offs.data();
-    opt_cb_data.shuffled_samples_begin = train_shuffled_samples_begin.data();
-    opt_cb_data.shuffled_samples_size  = train_shuffled_samples_size.data();
-    opt_cb_data.samples_count          = train_samples_size.size();
-    opt_cb_data.tokens_input           = tokens_input;
-    opt_cb_data.target_probs           = target_probs;
-    opt_cb_data.first_iter             = opt->iter;
-    opt_cb_data.first_epoch            = train->train_epochs;
-    opt_cb_data.iter_at_last_epoch     = -1;
-    opt_cb_data.last_time              = ggml_time_ms();
-    opt_cb_data.millis_per_iter        = 0.0;
-
-    // measure required memory for work buffer
-    size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE;
-    printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f));
-
-    // context for work buffer
-    struct ggml_init_params ctx_work_params = {
-        max_work_size, // mem_size
-        NULL,          // mem_buffer
-        false,         // no_alloc
-    };
-    struct ggml_context * ctx_work = ggml_init(ctx_work_params);
-
-    int64_t t0 = ggml_time_ms();
-
-    ggml_opt_resume_g(ctx_work, opt, loss, gf, gb, &train_opt_callback, (void *) &opt_cb_data);
-
-    ggml_free(ctx_work);
-    ggml_free(ctx_compute);
-    ggml_free(ctx_input);
-
-    int64_t t1 = ggml_time_ms();
-    printf("%s: total training time: ", __func__);
-    print_duration((double) (t1 - t0));
-    printf("\n");
-
-    int new_iters = opt->iter - opt_cb_data.last_save_iter;
-    if (new_iters > 0) {
-        train->train_its     += new_iters;
-        train->train_tokens  += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens;
-
-        save_train_files(&save_data, train);
-        opt_cb_data.last_save_iter = opt->iter;
-    }
-
-    ggml_free(opt->ctx);
-    free_train_state(train);
-    ggml_free(model.ctx);
-    llama_free(lctx);
-    llama_free_model(lmodel);
-    return 0;
-}

From eddcb5238b2e09a37798b87cde1244017a194bcc Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 25 Jul 2024 12:37:42 +0300
Subject: [PATCH 11/35] ggml : add and use ggml_cpu_has_llamafile() (#8664)

---
 ggml/include/ggml.h | 1 +
 ggml/src/ggml.c     | 8 ++++++++
 src/llama.cpp       | 6 +-----
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 2fdb9fa40..548661b9b 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2400,6 +2400,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_vsx        (void);
     GGML_API int ggml_cpu_has_matmul_int8(void);
     GGML_API int ggml_cpu_has_cann       (void);
+    GGML_API int ggml_cpu_has_llamafile  (void);
 
     //
     // Internal types and functions exposed for tests and benchmarks
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index dbb3a3ebe..f65837e85 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -22005,6 +22005,14 @@ int ggml_cpu_has_cann(void) {
 #endif
 }
 
+int ggml_cpu_has_llamafile(void) {
+#if defined(GGML_USE_LLAMAFILE)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_gpublas(void) {
     return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl();
 }
diff --git a/src/llama.cpp b/src/llama.cpp
index 9e502018d..80235ae19 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -19146,11 +19146,7 @@ const char * llama_print_system_info(void) {
     s += "SSSE3 = "       + std::to_string(ggml_cpu_has_ssse3())       + " | ";
     s += "VSX = "         + std::to_string(ggml_cpu_has_vsx())         + " | ";
     s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
-#ifdef GGML_USE_LLAMAFILE
-    s += "LLAMAFILE = 1 | ";
-#else
-    s += "LLAMAFILE = 0 | ";
-#endif
+    s += "LLAMAFILE = "   + std::to_string(ggml_cpu_has_llamafile())   + " | ";
 
     return s.c_str();
 }

From ed67bcb24f2d6ac0072cae72620b2bd971741b98 Mon Sep 17 00:00:00 2001
From: Chen Xi <xi2.chen@intel.com>
Date: Thu, 25 Jul 2024 11:45:18 +0000
Subject: [PATCH 12/35] [SYCL] fix multi-gpu issue on sycl (#8554)

---------

Signed-off-by: Chen Xi <xi2chen@intel.com>
Co-authored-by: Meng, Hengyu <hengyu.meng@intel.com>
---
 docs/backend/SYCL.md               |  36 ++++------
 ggml/src/ggml-sycl/common.hpp      |   2 +-
 ggml/src/ggml-sycl/dpct/helper.hpp | 101 +++++++++++++++++++++++++----
 src/llama.cpp                      |   4 +-
 4 files changed, 102 insertions(+), 41 deletions(-)

diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index 885983e92..d36ac0a15 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -293,31 +293,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
 ```sh
 ./build/bin/llama-ls-sycl-device
 ```
-A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
+This command only displays the devices of the selected backend supported by SYCL. The default backend is level_zero. For example, on a system with 2 *Intel GPUs* the output would look like the following:
 ```
-found 6 SYCL devices:
+found 2 SYCL devices:
+
 |  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
 |ID|       Device Type|                                         Name|capability|units      |group   |group  |Global mem size|
 |--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
 | 0|[level_zero:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       1.3|        512|    1024|     32|    16225243136|
 | 1|[level_zero:gpu:1]|                    Intel(R) UHD Graphics 770|       1.3|         32|     512|     32|    53651849216|
-| 2|    [opencl:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       3.0|        512|    1024|     32|    16225243136|
-| 3|    [opencl:gpu:1]|                    Intel(R) UHD Graphics 770|       3.0|         32|     512|     32|    53651849216|
-| 4|    [opencl:cpu:0]|         13th Gen Intel(R) Core(TM) i7-13700K|       3.0|         24|    8192|     64|    67064815616|
-| 5|    [opencl:acc:0]|               Intel(R) FPGA Emulation Device|       1.2|         24|67108864|     64|    67064815616|
 ```
 
-| Attribute              | Note                                                        |
-|------------------------|-------------------------------------------------------------|
-| compute capability 1.3 | Level-zero driver/runtime, recommended                      |
-| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases |
 
 4. Launch inference
 
 There are two device selection modes:
 
 - Single device: Use one device target specified by the user.
-- Multiple devices: Automatically select the devices with the same largest Max compute-units.
+- Multiple devices: Automatically choose the devices with the same backend.
+
+In both device selection modes, the default SYCL backend is level_zero. You can choose another backend supported by SYCL by setting the environment variable ONEAPI_DEVICE_SELECTOR.
 
 | Device selection | Parameter                              |
 |------------------|----------------------------------------|
@@ -474,33 +469,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
 build\bin\ls-sycl-device.exe
 ```
 
-The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following:
+This command only displays the devices of the selected backend that is supported by SYCL. The default backend is level_zero. For example, on a system with 2 *Intel GPUs* the output would look like the following:
 ```
-found 6 SYCL devices:
+found 2 SYCL devices:
 |  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
 |ID|       Device Type|                                         Name|capability|units      |group   |group  |Global mem size|
 |--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
 | 0|[level_zero:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       1.3|        512|    1024|     32|    16225243136|
 | 1|[level_zero:gpu:1]|                    Intel(R) UHD Graphics 770|       1.3|         32|     512|     32|    53651849216|
-| 2|    [opencl:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       3.0|        512|    1024|     32|    16225243136|
-| 3|    [opencl:gpu:1]|                    Intel(R) UHD Graphics 770|       3.0|         32|     512|     32|    53651849216|
-| 4|    [opencl:cpu:0]|         13th Gen Intel(R) Core(TM) i7-13700K|       3.0|         24|    8192|     64|    67064815616|
-| 5|    [opencl:acc:0]|               Intel(R) FPGA Emulation Device|       1.2|         24|67108864|     64|    67064815616|
 
 ```
 
-| Attribute              | Note                                                      |
-|------------------------|-----------------------------------------------------------|
-| compute capability 1.3 | Level-zero running time, recommended                      |
-| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases |
-
 
 4. Launch inference
 
 There are two device selection modes:
 
-- Single device: Use one device assigned by user.
-- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
+- Single device: Use one device assigned by the user. The default device ID is 0.
+- Multiple devices: Automatically choose the devices with the same backend.
+
+In both device selection modes, the default SYCL backend is level_zero. You can choose another backend supported by SYCL by setting the environment variable ONEAPI_DEVICE_SELECTOR.
 
 | Device selection | Parameter                              |
 |------------------|----------------------------------------|
diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index 68d41411b..397bd98dd 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -267,7 +267,7 @@ struct ggml_backend_sycl_context {
 
     queue_ptr stream(int device, int stream) {
         if (qptrs[device][stream] == nullptr) {
-            qptrs[device][stream] = &(dpct::get_current_device().default_queue());
+            qptrs[device][stream] = &(dpct::get_device(device).default_queue());
         }
         return qptrs[device][stream];
     }
diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp
index 31df1cb9e..4aaa76bfb 100644
--- a/ggml/src/ggml-sycl/dpct/helper.hpp
+++ b/ggml/src/ggml-sycl/dpct/helper.hpp
@@ -588,7 +588,7 @@ namespace dpct
         out = prop;
     }
 
-   /// dpct device extension
+    /// dpct device extension
     class device_ext : public sycl::device {
       typedef std::mutex mutex_type;
 
@@ -697,7 +697,7 @@ namespace dpct
         std::unique_lock<mutex_type> lock(m_mutex);
         lock.unlock();
         for (auto &q : _queues) {
-          q.wait_and_throw();
+            q.wait_and_throw();
         }
         // Guard the destruct of current_queues to make sure the ref count is
         // safe.
@@ -734,7 +734,12 @@ namespace dpct
 
       void destroy_queue(sycl::queue queue) {
         std::lock_guard<mutex_type> lock(m_mutex);
-        _queues.clear();
+        _queues.erase(std::remove_if(_queues.begin(), _queues.end(),
+                                    [=](const sycl::queue &q) -> bool
+                                    {
+                                        return q == queue;
+                                    }),
+                    _queues.end());
       }
       void set_saved_queue(sycl::queue q) {
         std::lock_guard<mutex_type> lock(m_mutex);
@@ -764,13 +769,13 @@ namespace dpct
         if (enable_exception_handler) {
           eh = exception_handler;
         }
-        auto q = sycl::queue(*this, eh,
-                             sycl::property_list(
+        _queues.push_back(sycl::queue(
+            *this, eh,
+            sycl::property_list(
 #ifdef DPCT_PROFILING_ENABLED
-                                 sycl::property::queue::enable_profiling(),
+                sycl::property::queue::enable_profiling(),
 #endif
-                                 properties...));
-        _queues.push_back(q);
+                properties...)));
 
         return _queues.back();
       }
@@ -783,8 +788,8 @@ namespace dpct
         if (enable_exception_handler) {
           eh = exception_handler;
         }
-        _queues.push_back(
-            sycl::queue(device, eh,
+        _queues.push_back(sycl::queue(
+            device, eh,
                         sycl::property_list(
 #ifdef DPCT_PROFILING_ENABLED
                             sycl::property::queue::enable_profiling(),
@@ -855,15 +860,75 @@ namespace dpct
         unsigned int get_device_id(const sycl::device &dev)
         {
             unsigned int id = 0;
-            for (auto dev_item : _devs)
+            for (auto &dev_item : _devs)
             {
                 if (*dev_item == dev)
                 {
-                    break;
+                    return id;
                 }
                 id++;
             }
-            return id;
+            return -1;
+        }
+
+        /// Picks the GPU platform to use. The backend keyword comes from the
+        /// ONEAPI_DEVICE_SELECTOR environment variable (default: level-zero).
+        /// Throws std::runtime_error on an unrecognized selector or when no GPU
+        /// platform name contains the keyword.
+        inline std::string get_preferred_gpu_platform_name() {
+            std::string result;
+            std::string filter = "level-zero";
+            const char* env = getenv("ONEAPI_DEVICE_SELECTOR");
+            if (env) {
+                if (std::strstr(env, "level_zero")) {
+                    filter = "level-zero";
+                }
+                else if (std::strstr(env, "opencl")) {
+                    filter = "opencl";
+                }
+                else if (std::strstr(env, "cuda")) {
+                    filter = "cuda";
+                }
+                else if (std::strstr(env, "hip")) {
+                    filter = "hip";
+                }
+                else {
+                    throw std::runtime_error("invalid device filter: " + std::string(env));
+                }
+            }
+
+            auto platform_list = sycl::platform::get_platforms();
+            for (const auto& platform : platform_list) {
+                auto devices = platform.get_devices();
+                auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) {
+                    return d.is_gpu();
+                });
+
+                if (gpu_dev == devices.end()) {
+                    continue; // platform has no GPU devices
+                }
+
+                auto platform_name = platform.get_info<sycl::info::platform::name>();
+                std::string platform_name_low_case;
+                platform_name_low_case.resize(platform_name.size());
+
+                // go through unsigned char: ::tolower is undefined for negative char values
+                std::transform(
+                    platform_name.begin(), platform_name.end(), platform_name_low_case.begin(),
+                    [](unsigned char c) { return static_cast<char>(::tolower(c)); });
+
+                if (platform_name_low_case.find(filter) == std::string::npos) {
+                    continue; // platform name does not match the requested backend
+                }
+
+                // no break: when several platforms match, the last one enumerated is kept
+                result = platform_name;
+            }
+
+            if (result.empty())
+                throw std::runtime_error("can not find preferred GPU platform");
+
+            return result;
+        }
 
         template <class DeviceSelector>
@@ -930,10 +995,15 @@ namespace dpct
             // Keep track of the number of devices per backend
             std::map<sycl::backend, size_t> DeviceNums;
             std::map<std::string, std::vector<sycl::device>> backend_devices;
+            auto preferred_platform_name = get_preferred_gpu_platform_name();
 
             while (!Platforms.empty()) {
                 auto Platform = Platforms.back();
                 Platforms.pop_back();
+                auto platform_name = Platform.get_info<sycl::info::platform::name>();
+                if (platform_name.compare(preferred_platform_name) != 0) {
+                    continue;
+                }
                 auto devices = Platform.get_devices();
                 std::string backend_type = get_device_backend_and_type(devices[0]);
                 for (const auto &device : devices) {
@@ -1989,6 +2059,11 @@ namespace dpct
         return dev_mgr::instance().current_device();
     }
 
+    static inline device_ext &get_device(unsigned int id)
+    {
+        return dev_mgr::instance().get_device(id);
+    }
+
     static inline sycl::queue &get_in_order_queue()
     {
         return dev_mgr::instance().current_device().in_order_queue();
diff --git a/src/llama.cpp b/src/llama.cpp
index 80235ae19..972f870b0 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16643,9 +16643,7 @@ struct llama_context * llama_new_context_with_model(
             for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
                 ggml_backend_t backend = ggml_backend_sycl_init(i);
                 if (backend == nullptr) {
-                    int id_list[GGML_SYCL_MAX_DEVICES];
-                    ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
-                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
+                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d for No.%d backend\n", __func__, i, i);
                     llama_free(ctx);
                     return nullptr;
                 }

From 88954f7fbd31aeb8c75140edee03e7a8ad5e2d9c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 25 Jul 2024 18:57:44 +0300
Subject: [PATCH 13/35] tests : fix printfs (#8068)

---
 tests/test-chat-template.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp
index 46a7d3aea..a8222caee 100644
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -146,7 +146,7 @@ int main(void) {
     auto fmt_sys = [&](std::string tmpl) {
         auto output = llama_chat_format_single(nullptr, tmpl, chat2, sys_msg, false);
         printf("fmt_sys(%s) : %s\n", tmpl.c_str(), output.c_str());
-        printf("-------------------------\n", output.c_str());
+        printf("-------------------------\n");
         return output;
     };
     assert(fmt_sys("chatml") == "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n");
@@ -165,7 +165,7 @@ int main(void) {
     auto fmt_single = [&](std::string tmpl) {
         auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
         printf("fmt_single(%s) : %s\n", tmpl.c_str(), output.c_str());
-        printf("-------------------------\n", output.c_str());
+        printf("-------------------------\n");
         return output;
     };
     assert(fmt_single("chatml") == "\n<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n");

From bf5a81df375f1c71e41462e1f48d57db359c9e80 Mon Sep 17 00:00:00 2001
From: "Andreas (Andi) Kunar" <andreask@msn.com>
Date: Thu, 25 Jul 2024 18:01:00 +0200
Subject: [PATCH 14/35] ggml : fix build on Windows with Snapdragon X (#8531)

* Improvements for Windows with Snapdragon X

* Revert "Improvements for Windows with Snapdragon X"

This reverts commit bf21397ae5ea7c73d3494db3b91505599909227d.

* Improvements for Windows with Snapdragon X

* WOA build clarifications

* WIndows on ARM build clarifications

* cmake build for Windows clarifications

* Update docs/build.md

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: AndreasKunar <andreaskmsn.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 docs/build.md           | 13 ++++++++++++-
 ggml/src/ggml-aarch64.c | 12 ++++++------
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/docs/build.md b/docs/build.md
index 916fcf22d..d9d12c467 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -16,7 +16,7 @@ In order to build llama.cpp you have four different options.
       make
       ```
 
-  - On Windows:
+  - On Windows (x86/x64 only, arm64 requires cmake):
 
     1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
     2. Extract `w64devkit` on your pc.
@@ -60,6 +60,17 @@ In order to build llama.cpp you have four different options.
       cmake -B build -G "Xcode"
       cmake --build build --config Debug
       ```
+    - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
+      - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake, ...):
+        - Tab Workload: Desktop development with C++
+        - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
+      - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
+      - For Windows on ARM (arm64, WoA) build with:
+        ```bash
+        cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
+        cmake --build build-arm64-windows-llvm-release
+        ```
+        Note: Building for arm64 can also be done with MSVC alone (with the build-arm64-windows-MSVC preset, or the standard CMake build instructions). However, MSVC does not support inline ARM assembly code, which is used e.g. for the accelerated Q4_0_4_8 CPU kernels.
 
 -   Using `gmake` (FreeBSD):
 
diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c
index 26535b1c4..af53dea17 100644
--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
@@ -392,7 +392,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
 #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
     GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
                 "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
-#elif defined(__ARM_NEON) && defined(__aarch64__)
+#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -501,7 +501,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -613,7 +613,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     if (svcntw() == 8) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
@@ -753,7 +753,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
 #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
     GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
                 "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
-#elif defined(__ARM_NEON) && defined(__aarch64__)
+#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -1271,7 +1271,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -1727,7 +1727,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     if (svcntw() == 8) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;

From 4226a8d10e3904db3a1297919fe6c7f06beba6c0 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 25 Jul 2024 19:57:31 +0300
Subject: [PATCH 15/35] llama : fix build + fix fabs compile warnings (#8683)

ggml-ci
---
 src/llama-grammar.h |  2 --
 src/llama.cpp       | 10 ++--------
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/src/llama-grammar.h b/src/llama-grammar.h
index 8e578e09f..695ea0632 100644
--- a/src/llama-grammar.h
+++ b/src/llama-grammar.h
@@ -13,8 +13,6 @@ struct llama_grammar {
     llama_partial_utf8 partial_utf8;
 };
 
-struct llama_grammar * llama_get_grammar(struct llama_context * ctx);
-
 //
 // internal API
 //
diff --git a/src/llama.cpp b/src/llama.cpp
index 972f870b0..bc0183741 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2657,7 +2657,6 @@ struct llama_context {
     llama_context(const llama_model & model)
         : model(model)
         , sampling(llama_n_vocab(&model))
-        , grammar()
         , t_start_us(model.t_start_us)
         , t_load_us(model.t_load_us) {}
 
@@ -2675,7 +2674,6 @@ struct llama_context {
 
     struct llama_cparams        cparams;
     struct llama_sampling       sampling;
-    struct llama_grammar        grammar;
     struct llama_kv_cache       kv_self;
     struct llama_control_vector cvec;
 
@@ -14048,7 +14046,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
                             f = -INFINITY;
                         } else {
                             if (hparams.use_alibi) {
-                                f = -fabs(lctx.kv_self.cells[i].pos - pos);
+                                f = -std::abs(lctx.kv_self.cells[i].pos - pos);
                             } else {
                                 f = 0.0f;
                             }
@@ -14102,7 +14100,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
                         for (int s = 0; s < batch.n_seq_id[i]; ++s) {
                             if (batch.seq_id[i][s] == seq_id) {
                                 if (hparams.use_alibi) {
-                                    f = -fabs(batch.pos[i] - batch.pos[j]);
+                                    f = -std::abs(batch.pos[i] - batch.pos[j]);
                                 } else {
                                     f = 0.0f;
                                 }
@@ -16833,10 +16831,6 @@ const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx) {
     return &ctx->model.vocab;
 }
 
-struct llama_grammar * llama_get_grammar(struct llama_context * ctx) {
-    return &ctx->grammar;
-}
-
 uint32_t llama_n_ctx(const struct llama_context * ctx) {
     return ctx->cparams.n_ctx;
 }

From 49ce0ab6d45402e8bb622bf86f86529f2b0ba552 Mon Sep 17 00:00:00 2001
From: DavidKorczynski <david@adalogics.com>
Date: Thu, 25 Jul 2024 22:23:05 +0100
Subject: [PATCH 16/35] ggml: handle ggml_init failure to fix NULL pointer
 deref (#8692)

`ggml_init` can fail if no unused context is found. In that case, a NULL-pointer deref will happen later in the code during a call to `ggml_set_on_alloc`.

This fixes it by bailing out if no context is found.
---
 ggml/src/ggml.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index f65837e85..29afcc7f8 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -21096,6 +21096,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         };
 
         *params.ctx = ggml_init(pdata);
+        if (*params.ctx == NULL) {
+            fprintf(stderr, "%s: failed to initialize context\n", __func__);
+            fclose(file);
+            gguf_free(ctx);
+            return NULL;
+        }
 
         struct ggml_context * ctx_data = *params.ctx;
 

From 41cd47caab88c442edc50e90c8d8d0ac3e82768d Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Thu, 25 Jul 2024 23:49:39 +0200
Subject: [PATCH 17/35] examples : export-lora : fix issue with quantized base
 models (#8687)

---
 examples/export-lora/export-lora.cpp | 64 +++++++++++++++++-----------
 1 file changed, 39 insertions(+), 25 deletions(-)

diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp
index 124ee167d..150f7e8d5 100644
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -211,8 +211,9 @@ struct lora_merge_ctx {
             }
         }
 
-        // if true, this tensor can be lora-merged. if false, we skip merging and just copy data to outfile
-        std::vector<std::pair<struct ggml_tensor *, bool>> base_tensors;
+        // mapping base tensor to out tensor (same shape with base, but different type)
+        // if out_tensor == nullptr, we only copy it
+        std::vector<std::pair<struct ggml_tensor *, struct ggml_tensor *>> base_to_out_tensors;
         for (auto & it : base_model.tensors) {
             bool t_a = true;
             bool t_b = true;
@@ -221,22 +222,22 @@ struct lora_merge_ctx {
                 t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b");
             }
             auto base_tensor = it.second;
-            struct ggml_tensor * out_tensor;
             if (!t_a && !t_b) {
                 // only copy
-                out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
-                ggml_set_name(out_tensor, base_tensor->name);
-                base_tensors.push_back(std::make_pair(out_tensor, false));
+                struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
+                ggml_set_name(cpy_tensor, base_tensor->name);
+                base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr));
+                gguf_add_tensor(ctx_out, cpy_tensor);
             } else if (t_a && t_b) {
                 // need merging
-                out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
-                out_tensor->type = get_out_tensor_type(base_tensor);
+                struct ggml_tensor * out_tensor = ggml_new_tensor(
+                    ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne);
                 ggml_set_name(out_tensor, base_tensor->name);
-                base_tensors.push_back(std::make_pair(out_tensor, true));
+                base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor));
+                gguf_add_tensor(ctx_out, out_tensor);
             } else {
                 throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
             }
-            gguf_add_tensor(ctx_out, out_tensor);
         }
 
         // placeholder for the meta data
@@ -247,9 +248,9 @@ struct lora_merge_ctx {
 
         // process base model tensors
         size_t n_merged = 0;
-        for (auto & it : base_tensors) {
-            if (it.second) {
-                merge_tensor(it.first);
+        for (auto & it : base_to_out_tensors) {
+            if (it.second != nullptr) {
+                merge_tensor(it.first, it.second);
                 n_merged++;
             } else {
                 copy_tensor(it.first);
@@ -265,7 +266,7 @@ struct lora_merge_ctx {
         }
 
         printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
-        printf("%s : wrote %ld tensors to output file\n", __func__, base_tensors.size());
+        printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size());
     }
 
     void copy_tensor(struct ggml_tensor * base) {
@@ -276,7 +277,7 @@ struct lora_merge_ctx {
         zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
     }
 
-    void merge_tensor(struct ggml_tensor * base) {
+    void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) {
         std::string name_base(base->name);
         std::string name_lora_a = name_base + ".lora_a";
         std::string name_lora_b = name_base + ".lora_b";
@@ -287,14 +288,14 @@ struct lora_merge_ctx {
         std::vector<struct ggml_tensor *> inp_a(adapters.size());
         std::vector<struct ggml_tensor *> inp_b(adapters.size());
         struct ggml_init_params params {
-            /*.mem_size   =*/ ggml_tensor_overhead()*(1+adapters.size()*2),
+            /*.mem_size   =*/ ggml_tensor_overhead()*(2+adapters.size()*2),
             /*.mem_buffer =*/ NULL,
             /*.no_alloc   =*/ true,
         };
         struct ggml_context * ctx = ggml_init(params);
 
         // alloc tensors
-        struct ggml_tensor * inp = ggml_dup_tensor(ctx, base);
+        struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, base->ne);
         for (size_t i = 0; i < adapters.size(); ++i) {
             auto t_a = adapters[i]->get_tensor(name_lora_a);
             auto t_b = adapters[i]->get_tensor(name_lora_b);
@@ -303,9 +304,21 @@ struct lora_merge_ctx {
         }
         ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
 
-        // load data to backend buffer
+        // load base tensor to backend buffer
         base_model.read_tensor_data(name_base, read_buf);
-        ggml_backend_tensor_set(inp, read_buf.data(), 0, ggml_nbytes(inp));
+        if (base->type != GGML_TYPE_F32) {
+            // optionally dequantize it
+            printf("%s :   + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
+            auto nels = ggml_nelements(inp_base);
+            ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
+            std::vector<uint8_t> dequant_buf(nels * sizeof(float));
+            qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
+            ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
+        } else {
+            ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
+        }
+
+        // load lora tensors to backend buffer
         for (size_t i = 0; i < adapters.size(); ++i) {
             adapters[i]->read_tensor_data(name_lora_a, read_buf);
             ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i]));
@@ -325,20 +338,21 @@ struct lora_merge_ctx {
             };
             struct ggml_context * ctx0 = ggml_init(params0);
             gf = ggml_new_graph(ctx0);
-            struct ggml_tensor * cur = inp;
+            struct ggml_tensor * cur = inp_base;
             for (size_t i = 0; i < adapters.size(); ++i) {
-                struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, inp_a[i]));
-                struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, inp_b[i]);
+                struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)));
+                struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
                 // scale
                 const float alpha = adapters[i]->alpha;
                 const float rank  = (float) inp_b[i]->ne[0];
                 const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
                 delta = ggml_scale(ctx0, delta, scale);
-                cur = ggml_add(ctx0, cur, delta);
-                printf("%s :   + merging from adapter[%ld]\n", __func__, i);
+                cur = ggml_add(ctx0, delta, cur);
+                printf("%s :   + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
                 printf("%s :     input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
             }
-            cur = ggml_cast(ctx0, cur, get_out_tensor_type(base));
+            cur = ggml_cast(ctx0, cur, out->type);
+            printf("%s :   + output type is %s\n", __func__, ggml_type_name(out->type));
             ggml_build_forward_expand(gf, cur);
             ggml_free(ctx0);
         }

From 01aec4a6310ab0160483196db0e726d78d4c94b6 Mon Sep 17 00:00:00 2001
From: Yaiko <elyaiko@hotmail.com>
Date: Thu, 25 Jul 2024 18:10:16 -0400
Subject: [PATCH 18/35] server : add Speech Recognition & Synthesis to UI
 (#8679)

* server : add Speech Recognition & Synthesis to UI

* server : add Speech Recognition & Synthesis to UI (fixes)
---
 examples/server/public/index.html | 180 +++++++++++++++++++++++++++---
 1 file changed, 164 insertions(+), 16 deletions(-)

diff --git a/examples/server/public/index.html b/examples/server/public/index.html
index 48628a960..07fec6a38 100644
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -1,5 +1,4 @@
 <html>
-
 <head>
   <meta charset="UTF-8">
   <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
@@ -132,12 +131,20 @@
       align-items: stretch;
     }
 
-    .right {
+    .message-controls {
       display: flex;
-      flex-direction: row;
-      gap: 0.5em;
       justify-content: flex-end;
     }
+    .message-controls > div:nth-child(2) {
+      display: flex;
+      flex-direction: column;
+      gap: 0.5em;
+    }
+    .message-controls > div:nth-child(2) > div {
+      display: flex;
+      margin-left: auto;
+      gap: 0.5em;
+    }
 
     fieldset {
       border: none;
@@ -276,6 +283,7 @@
 
     import { llama } from './completion.js';
     import { SchemaConverter } from './json-schema-to-grammar.mjs';
+
     let selected_image = false;
     var slot_id = -1;
 
@@ -447,6 +455,9 @@
 
     /* END: Support for storing prompt templates and parameters in browsers LocalStorage */
 
+    const tts = window.speechSynthesis;
+    const ttsVoice = signal(null)
+
     const llamaStats = signal(null)
     const controller = signal(null)
 
@@ -596,8 +607,51 @@
       });
     }
 
+    const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
+    const talkRecognition = SpeechRecognition ? new SpeechRecognition() : null;
     function MessageInput() {
-      const message = useSignal("")
+      const message = useSignal("");
+
+      const talkActive = useSignal(false);
+      const sendOnTalk = useSignal(false);
+      const talkStop = (e) => {
+        if (e) e.preventDefault();
+
+        talkActive.value = false;
+        talkRecognition?.stop();
+      }
+      const talk = (e) => {
+        e.preventDefault();
+
+        if (talkRecognition)
+          talkRecognition.start();
+        else
+          alert("Speech recognition is not supported by this browser.");
+      }
+      if(talkRecognition) {
+        talkRecognition.onstart = () => {
+          talkActive.value = true;
+        }
+        talkRecognition.onresult = (e) => { // e: the recognition result event delivered to this handler
+          if (e.results.length > 0) { // read the handler argument, not the deprecated implicit global `event`
+            message.value = e.results[0][0].transcript; // first alternative of the first result
+            if (sendOnTalk.value) {
+              submit(e); // "Send after talking" is ticked: submit the transcript right away
+            }
+          }
+        }
+        talkRecognition.onspeechend = () => {
+          talkStop();
+        }
+      }
+
+      const ttsVoices = useSignal(tts?.getVoices() || []);
+      const ttsVoiceDefault = computed(() => ttsVoices.value.find(v => v.default));
+      if (tts) {
+        tts.onvoiceschanged = () => {
+          ttsVoices.value = tts.getVoices();
+        }
+      }
 
       const submit = (e) => {
         stop(e);
@@ -624,11 +678,45 @@
                value="${message}"
             />
           </div>
-          <div class="right">
-            <button type="submit" disabled=${generating.value}>Send</button>
-            <button onclick=${uploadImage}>Upload Image</button>
-            <button onclick=${stop} disabled=${!generating.value}>Stop</button>
-            <button onclick=${reset}>Reset</button>
+          <div class="message-controls">
+            <div> </div>
+            <div>
+              <div>
+                <button type="submit" disabled=${generating.value || talkActive.value}>Send</button>
+                <button disabled=${generating.value || talkActive.value} onclick=${uploadImage}>Upload Image</button>
+                <button onclick=${stop} disabled=${!generating.value}>Stop</button>
+                <button onclick=${reset}>Reset</button>
+              </div>
+              <div>
+                <a href="#" style="cursor: help;" title="Help" onclick=${e => {
+                  e.preventDefault();
+                  alert(`STT supported by your browser: ${SpeechRecognition ? 'Yes' : 'No'}\n` +
+                  `(TTS and speech recognition are not provided by llama.cpp)\n` +
+                  `Note: STT requires HTTPS to work.`);
+                }}>[?]</a>
+                <button disabled=${generating.value} onclick=${talkActive.value ? talkStop : talk}>${talkActive.value ? "Stop Talking" : "Talk"}</button>
+                <div>
+                  <input type="checkbox" id="send-on-talk" name="send-on-talk" checked="${sendOnTalk}" onchange=${(e) => sendOnTalk.value = e.target.checked} />
+                  <label for="send-on-talk" style="line-height: initial;">Send after talking</label>
+                </div>
+              </div>
+              <div>
+                <a href="#" style="cursor: help;" title="Help" onclick=${e => {
+                  e.preventDefault();
+                  alert(`TTS supported by your browser: ${tts ? 'Yes' : 'No'}\n(TTS and speech recognition are not provided by llama.cpp)`);
+                }}>[?]</a>
+                <label for="tts-voices" style="line-height: initial;">Bot Voice:</label>
+                <select id="tts-voices" name="tts-voices" onchange=${(e) => ttsVoice.value = e.target.value} style="max-width: 100px;">
+                  <option value="" selected="${!ttsVoice.value}">None</option>
+                  ${[
+                    ...(ttsVoiceDefault.value ? [ttsVoiceDefault.value] : []),
+                    ...ttsVoices.value.filter(v => !v.default),
+                  ].map(
+                    v => html`<option value="${v.name}" selected="${ttsVoice.value === v.name}">${v.name} (${v.lang}) ${v.default ? '(default)' : ''}</option>`
+                  )}
+                </select>
+              </div>
+            </div>
           </div>
         </form>
       `
@@ -659,26 +747,86 @@
         }
       }, [messages])
 
+      const ttsChatLineActiveIx = useSignal(undefined);
+      const ttsChatLine = (e, ix, msg) => {
+        if (e) e.preventDefault();
+
+        if (!tts || !ttsVoice.value || !('SpeechSynthesisUtterance' in window)) return;
+
+        const ttsVoices = tts.getVoices();
+        const voice = ttsVoices.find(v => v.name === ttsVoice.value);
+        if (!voice) return;
+
+        if (ttsChatLineActiveIx.value !== undefined) {
+          tts.cancel();
+          if (ttsChatLineActiveIx.value === ix) {
+            ttsChatLineActiveIx.value = undefined;
+            return;
+          }
+        }
+
+        ttsChatLineActiveIx.value = ix;
+        let ttsUtter = new SpeechSynthesisUtterance(msg);
+        ttsUtter.voice = voice;
+        ttsUtter.onend = e => {
+          ttsChatLineActiveIx.value = undefined;
+        };
+        tts.speak(ttsUtter);
+      }
+
       const isCompletionMode = session.value.type === 'completion'
+
+      // Auto-play: once generation has stopped and a new bot line has appeared, try to speak the newest bot line
+      const lastCharChatLinesIxs = useSignal([]); // indices of bot lines, appended by chatLine while rendering
+      const lastCharChatLinesIxsOld = useSignal([]); // snapshot taken the last time auto-play ran
+      useEffect(() => {
+        if (
+          !isCompletionMode
+          && lastCharChatLinesIxs.value.length !== lastCharChatLinesIxsOld.value.length
+          && !generating.value
+        ) {
+          const ix = lastCharChatLinesIxs.value[lastCharChatLinesIxs.value.length - 1];
+          if (ix !== undefined) {
+            const data = messages[ix][1]; // NOTE(review): assumes entries are [user, data] pairs like chatLine's argument - confirm
+            ttsChatLine(null, ix, Array.isArray(data) ? data.map(m => m.content).join('') : data);
+          }
+
+          lastCharChatLinesIxsOld.value = structuredClone(lastCharChatLinesIxs.value);
+        }
+      }, [generating.value]);
+
       const chatLine = ([user, data], index) => {
         let message
-        const isArrayMessage = Array.isArray(data)
+        const isArrayMessage = Array.isArray(data);
+        const text = isArrayMessage ?
+            data.map(msg => msg.content).join('') :
+            data;
         if (params.value.n_probs > 0 && isArrayMessage) {
           message = html`<${Probabilities} data=${data} />`
         } else {
-          const text = isArrayMessage ?
-            data.map(msg => msg.content).join('') :
-            data;
           message = isCompletionMode ?
             text :
             html`<${Markdownish} text=${template(text)} />`
         }
+
+        const fromBot = user && user === '{{char}}';
+        if (fromBot && !lastCharChatLinesIxs.value.includes(index))
+          lastCharChatLinesIxs.value.push(index);
+
         if (user) {
-          return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
+          return html`
+          <div>
+            <p key=${index}><strong>${template(user)}:</strong> ${message}</p>
+            ${
+              fromBot && ttsVoice.value // play/pause control only on bot lines, and only once a voice is selected
+              && html`<button disabled=${generating.value} onclick=${e => ttsChatLine(e, index, text)} aria-label=${ttsChatLineActiveIx.value === index ? 'Pause' : 'Play'}>${ ttsChatLineActiveIx.value === index ? '⏸️' : '▶️' }</button>`
+            }
+          </div>
+          `;
         } else {
           return isCompletionMode ?
             html`<span key=${index}>${message}</span>` :
-            html`<p key=${index}>${message}</p>`
+            html`<div><p key=${index}>${message}</p></div>`
         }
       };
 

From 01245f5b1629075543bc4478418c7d72a0b4b3c7 Mon Sep 17 00:00:00 2001
From: Judd <foldl@users.noreply.github.com>
Date: Fri, 26 Jul 2024 16:38:12 +0800
Subject: [PATCH 19/35] llama : fix order of parameters (#8706)

usage of `aclrtGetMemInfo` is correct:

https://www.hiascend.com/doc_center/source/zh/canncommercial/63RC2/inferapplicationdev/aclcppdevg/aclcppdevg_03_0103.html

Co-authored-by: Judd <foldl@boxvest.com>
---
 src/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index bc0183741..77f7d32f8 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2905,7 +2905,7 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
 #elif defined(GGML_USE_CANN)
     size_t total;
     size_t free;
-    ggml_backend_cann_get_device_memory(device, &total, &free);
+    ggml_backend_cann_get_device_memory(device, &free, &total);
     return free;
 #else
     return 1;

From 2b1f616b208a4a21c4ee7a7eb85d822ff1d787af Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Sat, 27 Jul 2024 04:41:55 +0200
Subject: [PATCH 20/35] ggml : reduce hash table reset cost (#8698)

* ggml : reduce hash table reset cost

* fix unreachable code warnings after GGML_ASSERT(false)

* GGML_ASSERT(false) -> GGML_ABORT("fatal error")

* GGML_ABORT use format string
---
 Makefile                                 |   6 +-
 examples/eval-callback/eval-callback.cpp |   2 +-
 examples/imatrix/imatrix.cpp             |   4 +-
 examples/llama-bench/llama-bench.cpp     |   6 +-
 examples/llava/clip.cpp                  |   2 +-
 examples/tokenize/tokenize.cpp           |   2 +-
 ggml/include/ggml.h                      |  37 +-
 ggml/src/ggml-alloc.c                    |  42 +-
 ggml/src/ggml-backend.c                  | 214 +++---
 ggml/src/ggml-blas.cpp                   |   3 +-
 ggml/src/ggml-cann.cpp                   |   6 +-
 ggml/src/ggml-cann/aclnn_ops.cpp         |  26 +-
 ggml/src/ggml-cuda.cu                    |   6 +-
 ggml/src/ggml-cuda/argsort.cu            |   2 +-
 ggml/src/ggml-cuda/binbcast.cu           |   2 +-
 ggml/src/ggml-cuda/common.cuh            |   2 +-
 ggml/src/ggml-cuda/cpy.cu                |   4 +-
 ggml/src/ggml-cuda/dmmv.cu               |   2 +-
 ggml/src/ggml-cuda/fattn-common.cuh      |   6 +-
 ggml/src/ggml-cuda/fattn-tile-f16.cu     |   2 +-
 ggml/src/ggml-cuda/fattn-tile-f32.cu     |   2 +-
 ggml/src/ggml-cuda/fattn.cu              |  10 +-
 ggml/src/ggml-cuda/getrows.cu            |   3 +-
 ggml/src/ggml-cuda/mmq.cu                |   2 +-
 ggml/src/ggml-cuda/mmq.cuh               |   4 +-
 ggml/src/ggml-cuda/mmvq.cu               |   6 +-
 ggml/src/ggml-cuda/quantize.cu           |   2 +-
 ggml/src/ggml-cuda/rope.cu               |   4 +-
 ggml/src/ggml-impl.h                     | 116 +++-
 ggml/src/ggml-kompute.cpp                |   8 +-
 ggml/src/ggml-metal.m                    |  42 +-
 ggml/src/ggml-quants.c                   |  12 +-
 ggml/src/ggml-sycl.cpp                   |  16 +-
 ggml/src/ggml-sycl/common.hpp            |   2 +-
 ggml/src/ggml-sycl/dmmv.cpp              |   2 +-
 ggml/src/ggml-sycl/dpct/helper.hpp       |   2 +-
 ggml/src/ggml-sycl/mmq.cpp               |  22 +-
 ggml/src/ggml-sycl/mmvq.cpp              |   2 +-
 ggml/src/ggml-sycl/rope.cpp              |   4 +-
 ggml/src/ggml-vulkan.cpp                 |  58 +-
 ggml/src/ggml.c                          | 838 +++++++++++------------
 src/llama-grammar.cpp                    |   4 +-
 src/llama-vocab.cpp                      |  14 +-
 src/llama.cpp                            |  36 +-
 tests/test-backend-ops.cpp               |   8 +-
 tests/test-sampling.cpp                  |  10 +-
 46 files changed, 851 insertions(+), 754 deletions(-)

diff --git a/Makefile b/Makefile
index 8d2ccddc4..7e015af3e 100644
--- a/Makefile
+++ b/Makefile
@@ -325,9 +325,9 @@ ifdef LLAMA_DEBUG
 	endif
 else
 	MK_CPPFLAGS   += -DNDEBUG
-	MK_CFLAGS     += -O3
-	MK_CXXFLAGS   += -O3
-	MK_NVCCFLAGS  += -O3
+	MK_CFLAGS     += -O3 -g
+	MK_CXXFLAGS   += -O3 -g
+	MK_NVCCFLAGS  += -O3 -g
 endif
 
 ifdef LLAMA_SANITIZE_THREAD
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index c8a3016a4..37d30ab8c 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -62,7 +62,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
                     } else if (type == GGML_TYPE_I8) {
                         v = (float) *(int8_t *) &data[i];
                     } else {
-                        GGML_ASSERT(false);
+                        GGML_ABORT("fatal error");
                     }
                     printf("%12.4f", v);
                     sum += v;
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index 574f5ed9c..6ce1863cf 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -127,7 +127,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         }
         else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
             fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
-            exit(1); //GGML_ASSERT(false);
+            exit(1); //GGML_ABORT("fatal error");
         }
         if (m_params.verbosity > 1) {
             printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
@@ -176,7 +176,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         }
         else if (e.values.size() != (size_t)src1->ne[0]) {
             fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
-            exit(1); //GGML_ASSERT(false);
+            exit(1); //GGML_ABORT("fatal error");
         }
         ++e.ncall;
         if (m_params.verbosity > 1) {
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index a6497b6e0..521fa8880 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -150,7 +150,7 @@ static const char * output_format_str(output_formats format) {
         case JSON:     return "json";
         case MARKDOWN: return "md";
         case SQL:      return "sql";
-        default: GGML_ASSERT(!"invalid output format");
+        default: GGML_ABORT("invalid output format");
     }
 }
 
@@ -176,7 +176,7 @@ static const char * split_mode_str(llama_split_mode mode) {
         case LLAMA_SPLIT_MODE_NONE:  return "none";
         case LLAMA_SPLIT_MODE_LAYER: return "layer";
         case LLAMA_SPLIT_MODE_ROW:   return "row";
-        default: GGML_ASSERT(!"invalid split mode");
+        default: GGML_ABORT("invalid split mode");
     }
 }
 
@@ -1326,7 +1326,7 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
         case SQL:
             return std::unique_ptr<printer>(new sql_printer());
     }
-    GGML_ASSERT(false);
+    GGML_ABORT("fatal error");
 }
 
 int main(int argc, char ** argv) {
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index d23e282fb..7cda5f10c 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -869,7 +869,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             embeddings = peg_0;
         }
         else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
     }
 
diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp
index 2afb6024c..17f5e4961 100644
--- a/examples/tokenize/tokenize.cpp
+++ b/examples/tokenize/tokenize.cpp
@@ -163,7 +163,7 @@ static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) {
                 printf(">");
                 return;
             }
-            GGML_ASSERT(false && "MultiByteToWideChar() failed in an unexpected way.");
+            GGML_ABORT("MultiByteToWideChar() failed in an unexpected way.");
         }
 
         LPWSTR wstr = (LPWSTR) calloc(length_needed+1, sizeof(*wstr));
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 548661b9b..464d765da 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -254,18 +254,8 @@
 
 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
 
-#define GGML_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            fflush(stdout); \
-            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            ggml_print_backtrace(); \
-            abort(); \
-        } \
-    } while (0)
-
 #ifndef NDEBUG
-#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
+#define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
 #elif defined(__GNUC__)
 #define GGML_UNREACHABLE() __builtin_unreachable()
 #elif defined(_MSC_VER)
@@ -274,6 +264,17 @@
 #define GGML_UNREACHABLE() ((void) 0)
 #endif
 
+#ifdef __cplusplus
+#define GGML_NORETURN [[noreturn]]
+#elif defined(_MSC_VER)
+#define GGML_NORETURN __declspec(noreturn)
+#else
+#define GGML_NORETURN _Noreturn
+#endif
+
+#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
+#define GGML_ASSERT(x) do { if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x); } while (0) // single statement: safe in unbraced if/else
+
 // used to copy the number of elements and stride in bytes of tensors into local variables.
 // main purpose is to reduce code duplication and improve readability.
 //
@@ -322,6 +323,9 @@
 extern "C" {
 #endif
 
+    GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
+    GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
+
     enum ggml_status {
         GGML_STATUS_ALLOC_FAILED = -2,
         GGML_STATUS_FAILED = -1,
@@ -636,8 +640,11 @@ extern "C" {
         GGML_CGRAPH_EVAL_ORDER_COUNT
     };
 
+    typedef uint32_t ggml_bitset_t;
+
     struct ggml_hash_set {
         size_t size;
+        ggml_bitset_t * used;
         struct ggml_tensor ** keys;
     };
 
@@ -651,7 +658,7 @@ extern "C" {
         struct ggml_tensor ** grads;
         struct ggml_tensor ** leafs;
 
-        struct ggml_hash_set visited_hash_table;
+        struct ggml_hash_set visited_hash_set;
 
         enum ggml_cgraph_eval_order order;
     };
@@ -698,8 +705,6 @@ extern "C" {
     GGML_API int64_t ggml_cycles(void);
     GGML_API int64_t ggml_cycles_per_ms(void);
 
-    GGML_API void    ggml_print_backtrace(void);
-
     // accepts a UTF-8 path, even on Windows
     GGML_API FILE *  ggml_fopen(const char * fname, const char * mode);
 
@@ -2005,8 +2010,8 @@ extern "C" {
 
     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan            (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-    GGML_API enum ggml_status  ggml_graph_compute         (      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API struct ggml_cplan ggml_graph_plan   (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+    GGML_API enum ggml_status  ggml_graph_compute(      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
     // same as ggml_graph_compute() but the work data is allocated as a part of the context
     // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
     GGML_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index e176b883e..e485326ab 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -91,8 +91,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
     if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
         fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
                 __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
-        GGML_ASSERT(!"not enough space in the buffer");
-        return;
+        GGML_ABORT("not enough space in the buffer");
     }
 
     void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
@@ -133,7 +132,7 @@ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset,
             return;
         }
     }
-    GGML_ASSERT(!"out of allocated_tensors");
+    GGML_ABORT("out of allocated_tensors");
 }
 static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
@@ -142,8 +141,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
             return;
         }
     }
-    fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
-    GGML_ASSERT(!"tensor not found");
+    GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
 }
 #endif
 
@@ -176,8 +174,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
             // this should never happen
             fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
                     __func__, size, max_avail);
-            GGML_ASSERT(!"not enough space in the buffer");
-            GGML_UNREACHABLE();
+            GGML_ABORT("not enough space in the buffer");
         }
     }
 
@@ -443,7 +440,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
         }
     }
 
-    free(galloc->hash_set.keys);
+    ggml_hash_set_free(&galloc->hash_set);
     free(galloc->hash_values);
     free(galloc->bufts);
     free(galloc->buffers);
@@ -456,7 +453,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
 typedef struct ggml_gallocr * ggml_gallocr_t;
 
 static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
-    size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
+    size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t);
     return &galloc->hash_values[i];
 }
 
@@ -565,8 +562,8 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
 
 static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
     // clear hash tables
-    memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
-    memset(galloc->hash_values,   0, galloc->hash_set.size * sizeof(struct hash_node));
+    ggml_hash_set_reset(&galloc->hash_set);
+    memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
 
     // allocate leafs
     // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
@@ -671,21 +668,19 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
 }
 
 bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
-    size_t hash_size = graph->visited_hash_table.size;
+    size_t min_hash_size = graph->n_nodes + graph->n_leafs;
+    // add 25% margin to avoid hash collisions
+    min_hash_size += min_hash_size / 4;
 
     // initialize hash table
-    if (galloc->hash_set.size < hash_size) {
-        free(galloc->hash_set.keys);
-        free(galloc->hash_values);
-        galloc->hash_set.size = hash_size;
-        galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
-        galloc->hash_values   = calloc(hash_size, sizeof(struct hash_node));
+    if (galloc->hash_set.size < min_hash_size) {
+        ggml_hash_set_free(&galloc->hash_set);
+        galloc->hash_set = ggml_hash_set_new(min_hash_size);
         GGML_ASSERT(galloc->hash_set.keys != NULL);
+
+        free(galloc->hash_values);
+        galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
         GGML_ASSERT(galloc->hash_values != NULL);
-    } else {
-        // reset hash table
-        memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
-        memset(galloc->hash_values,   0, sizeof(struct hash_node) * galloc->hash_set.size);
     }
 
     // reset allocators
@@ -817,8 +812,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
 }
 
 static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
-    ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
-    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
+    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
     return talloc->size_max >= node_size;
 }
 
diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c
index d39cfed88..954ab2072 100644
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -1055,11 +1055,10 @@ struct ggml_backend_sched {
     ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
     ggml_gallocr_t galloc;
 
-    // hash keys of the nodes in the graph
-    struct ggml_hash_set    hash_set;
-    // hash values
-    int * tensor_backend_id;
-    struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
+    // hash map of the nodes in the graph
+    struct ggml_hash_set  hash_set;
+    int                 * hv_tensor_backend_ids; // [hash_set.size]
+    struct ggml_tensor ** hv_tensor_copies;      // [hash_set.size][n_backends][n_copies]
 
     int * node_backend_ids; // [graph_size]
     int * leaf_backend_ids; // [graph_size]
@@ -1068,7 +1067,7 @@ struct ggml_backend_sched {
     int * prev_leaf_backend_ids; // [graph_size]
 
     // copy of the graph with modified inputs
-    struct ggml_cgraph * graph;
+    struct ggml_cgraph graph;
 
     // graph splits
     struct ggml_backend_sched_split * splits;
@@ -1087,19 +1086,16 @@ struct ggml_backend_sched {
     ggml_backend_sched_eval_callback callback_eval;
     void * callback_eval_user_data;
 
-    bool debug;
+    char * context_buffer;
+    size_t context_buffer_size;
 
-    // align context_buffer to GGML_MEM_ALIGN
-#ifdef _MSC_VER
-    __declspec(align(GGML_MEM_ALIGN))
-#else
-    __attribute__((aligned(GGML_MEM_ALIGN)))
-#endif
-    char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
+    bool debug;
 };
 
-#define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
-#define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)]
+#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
+#define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
+#define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
+#define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
 
 // returns the priority of the backend, lower id is higher priority
 static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
@@ -1169,7 +1165,6 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
         return cur_backend_id;
     }
 
-    // assign nodes that use weights to the backend of the weights
     // operations with weights are preferably run on the same backend as the weights
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         const struct ggml_tensor * src = tensor->src[i];
@@ -1275,7 +1270,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     sched->is_reset = false;
 
     struct ggml_init_params params = {
-        /* .mem_size =   */ sizeof(sched->context_buffer),
+        /* .mem_size =   */ sched->context_buffer_size,
         /* .mem_buffer = */ sched->context_buffer,
         /* .no_alloc =   */ true
     };
@@ -1284,39 +1279,43 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 
     sched->ctx = ggml_init(params);
     if (sched->ctx == NULL) {
-        fprintf(stderr, "%s: failed to initialize context\n", __func__);
-        GGML_ASSERT(false);
+        GGML_ABORT("%s: failed to initialize context\n", __func__);
     }
 
     // pass 1: assign backends to ops with pre-allocated inputs
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         int * leaf_backend_id = &tensor_backend_id(leaf);
-        if (*leaf_backend_id != -1) {
-            // do not overwrite user assignments
-            continue;
+        // do not overwrite user assignments
+        if (*leaf_backend_id == -1) {
+            *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
         }
-        *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
     }
 
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         int * node_backend_id = &tensor_backend_id(node);
-        if (*node_backend_id != -1) {
-            // do not overwrite user assignments
-            continue;
-        }
-        *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
-        // src
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * src = node->src[j];
-            if (src == NULL) {
+        // do not overwrite user assignments
+        if (*node_backend_id == -1) {
+            *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
+
+#if 0
+            // src
+            if (node->op == GGML_OP_NONE) {
                 continue;
             }
-            int * src_backend_id = &tensor_backend_id(src);
-            if (*src_backend_id == -1) {
-                *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
+
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * src = node->src[j];
+                if (src == NULL) {
+                    continue;
+                }
+                int * src_backend_id = &tensor_backend_id(src);
+                if (*src_backend_id == -1) {
+                    *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
+                }
             }
+#endif
         }
     }
 
@@ -1488,12 +1487,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         }
     }
 
-    // pass 4: split graph, find tensors that need to be copied
+    // pass 5: split graph, find tensors that need to be copied
     {
         int i_split = 0;
         struct ggml_backend_sched_split * split = &sched->splits[0];
         // find the backend of the first split, skipping view ops
-        for (int i = 0; i < graph->n_nodes; i++) {
+        int i = 0;
+        for (; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (!ggml_is_view_op(node->op)) {
                 split->backend_id = tensor_backend_id(node);
@@ -1502,9 +1502,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         }
         split->i_start = 0;
         split->n_inputs = 0;
-        memset(split->inputs, 0, sizeof(split->inputs)); //HACK
         int cur_backend_id = split->backend_id;
-        for (int i = 0; i < graph->n_nodes; i++) {
+        for (; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
 
             if (ggml_is_view_op(node->op)) {
@@ -1513,7 +1512,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 
             const int node_backend_id = tensor_backend_id(node);
 
-            GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now
+            assert(node_backend_id != -1); // all nodes should be assigned by now
 
             // check if we should start a new split based on the sources of the current node
             bool need_new_split = false;
@@ -1527,7 +1526,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     // by starting a new split, the memory of the previously offloaded weights can be reused
                     if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
                         int src_backend_id = tensor_backend_id(src);
-                        if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
+                        if (src_backend_id != cur_backend_id) {
                             need_new_split = true;
                             break;
                         }
@@ -1536,9 +1535,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     // FIXME: count the number of inputs instead of only checking when full
                     if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
                         const size_t id = hash_id(src);
-                        int src_backend_id = sched->tensor_backend_id[id];
+                        int src_backend_id = sched->hv_tensor_backend_ids[id];
                         bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
-                        if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) {
+                        if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
                             //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
                             need_new_split = true;
                             break;
@@ -1570,12 +1569,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     continue;
                 }
 
-                const int src_backend_id = tensor_backend_id(src);
+                size_t src_id = hash_id(src);
+                const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
                 assert(src_backend_id != -1); // all inputs should be assigned by now
 
                 if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
-                    size_t id = hash_id(src);
-                    if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
+                    if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
                         ggml_backend_t backend = sched->backends[src_backend_id];
                         for (int c = 0; c < sched->n_copies; c++) {
                             struct ggml_tensor * tensor_copy;
@@ -1589,7 +1588,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                                 ggml_set_input(tensor_copy);
                                 ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                             }
-                            sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
+                            tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
                             SET_CAUSE(tensor_copy, "4.cpy");
                         }
                         int n_graph_inputs = sched->n_graph_inputs++;
@@ -1598,11 +1597,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     }
                 }
 
-                bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
-                if (src_backend_id != cur_backend_id && !supported) {
+                if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
                     // create a copy of the input in the split's backend
-                    const size_t id = hash_id(src);
-                    if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
+                    if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
                         ggml_backend_t backend = sched->backends[cur_backend_id];
                         for (int c = 0; c < sched->n_copies; c++) {
                             struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
@@ -1611,14 +1608,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                                 ggml_set_input(tensor_copy);
                                 ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                             }
-                            sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
+                            tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
                             SET_CAUSE(tensor_copy, "4.cpy");
                         }
                         int n_inputs = split->n_inputs++;
                         GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                         split->inputs[n_inputs] = src;
                     }
-                    node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
+                    node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
                 }
             }
         }
@@ -1630,7 +1627,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         ggml_backend_sched_print_assignments(sched, graph);
     }
 
-    // swap node_backend_ids and leaf_backend_ids and prevs
+    // swap node_backend_ids and leaf_backend_ids with prevs
     {
         int * tmp = sched->node_backend_ids;
         sched->node_backend_ids = sched->prev_node_backend_ids;
@@ -1641,9 +1638,19 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         sched->prev_leaf_backend_ids = tmp;
     }
 
-    // create copies of the graph for each split
-    // TODO: avoid this copy
-    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
+    int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+    if (sched->graph.size < graph_size) {
+        sched->graph.size = graph_size;
+        sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
+        sched->graph.leafs = realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
+        GGML_ASSERT(sched->graph.nodes != NULL);
+        GGML_ASSERT(sched->graph.leafs != NULL);
+    }
+    sched->graph.n_nodes = 0;
+    sched->graph.n_leafs = 0;
+
+    struct ggml_cgraph * graph_copy = &sched->graph;
+
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &sched->splits[i];
         split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
@@ -1654,12 +1661,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 
             struct ggml_tensor * input = split->inputs[j];
             const size_t input_id = hash_id(input);
-            struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];
+            struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);
 
             // add a dependency to the input source so that it is not freed before the copy is done
             struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
             input_dep->src[0] = input;
-            sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
+            sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
             graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
 
             // add a dependency to the input copy so that it is allocated at the start of the split
@@ -1681,7 +1688,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             size_t id = hash_id(input);
             int backend_id = tensor_backend_id(input);
             for (int c = 0; c < sched->n_copies; c++) {
-                struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+                struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
                 sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
                 graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
             }
@@ -1694,7 +1701,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 struct ggml_tensor * input = split->inputs[j];
                 size_t id = hash_id(input);
                 for (int c = 0; c < sched->n_copies; c++) {
-                    struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+                    struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
                     sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
                     graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
                 }
@@ -1708,13 +1715,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
         graph_copy->leafs[graph_copy->n_leafs++] = leaf;
     }
-
-    sched->graph = graph_copy;
 }
 
 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
     bool backend_ids_changed = false;
-    for (int i = 0; i < sched->graph->n_nodes; i++) {
+    for (int i = 0; i < sched->graph.n_nodes; i++) {
         if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
             sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
             backend_ids_changed = true;
@@ -1722,7 +1727,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
         }
     }
     if (!backend_ids_changed) {
-        for (int i = 0; i < sched->graph->n_leafs; i++) {
+        for (int i = 0; i < sched->graph.n_leafs; i++) {
             if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
                 sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
                 backend_ids_changed = true;
@@ -1732,14 +1737,14 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
     }
 
     // allocate graph
-    if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+    if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
         // the re-allocation may cause the split inputs to be moved to a different address
         ggml_backend_sched_synchronize(sched);
 #ifndef NDEBUG
-        fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__);
+        fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
 #endif
-        ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
-        if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+        ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
+        if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
             fprintf(stderr, "%s: failed to allocate graph\n", __func__);
             return false;
         }
@@ -1760,7 +1765,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
         for (int j = 0; j < split->n_inputs; j++) {
             ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];
+            struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
 
             if (input->flags & GGML_TENSOR_FLAG_INPUT) {
                 // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
@@ -1846,21 +1851,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
     struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
 
     sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
+    sched->n_backends = n_backends;
+    sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
 
     // initialize hash table
-    sched->hash_set          = ggml_hash_set_new(graph_size);
-    sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
-    sched->tensor_copies     = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
+    // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
+    sched->hash_set    = ggml_hash_set_new(graph_size);
+    sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
+    sched->hv_tensor_copies      = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
 
     const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
-    sched->node_backend_ids  = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
-    sched->leaf_backend_ids  = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
+    sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
+    sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
     sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
     sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
 
-    sched->n_backends = n_backends;
-
-    sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
+    sched->context_buffer_size = GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
+    sched->context_buffer = malloc(sched->context_buffer_size);
 
     const int initial_splits_capacity = 16;
     sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
@@ -1895,37 +1902,37 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     }
     ggml_gallocr_free(sched->galloc);
     ggml_free(sched->ctx);
+    ggml_hash_set_free(&sched->hash_set);
     free(sched->splits);
-    free(sched->hash_set.keys);
-    free(sched->tensor_backend_id);
-    free(sched->tensor_copies);
+    free(sched->hv_tensor_backend_ids);
+    free(sched->hv_tensor_copies);
     free(sched->node_backend_ids);
     free(sched->leaf_backend_ids);
     free(sched->prev_node_backend_ids);
     free(sched->prev_leaf_backend_ids);
+    free(sched->context_buffer);
+    free(sched->graph.nodes);
+    free(sched->graph.leafs);
     free(sched);
 }
 
 void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     // reset state for the next run
     if (!sched->is_reset) {
-        size_t hash_size = sched->hash_set.size;
-        memset(sched->hash_set.keys,      0, sizeof(sched->hash_set.keys[0])     * hash_size); // NOLINT
-        memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
-        memset(sched->tensor_copies,      0, sizeof(sched->tensor_copies[0])     * hash_size);
-
+        ggml_hash_set_reset(&sched->hash_set);
+        memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
+        memset(sched->hv_tensor_copies,       0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
         sched->is_reset = true;
     }
     sched->is_alloc = false;
 }
 
 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
-    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
+    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
 
     ggml_backend_sched_split_graph(sched, measure_graph);
 
-    // TODO: extract this to a separate function
-    if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
+    if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
         return false;
     }
 
@@ -1936,10 +1943,11 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
 }
 
 bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);
+    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
 
     ggml_backend_sched_split_graph(sched, graph);
 
+
     if (!ggml_backend_sched_alloc_splits(sched)) {
         return false;
     }
@@ -2009,6 +2017,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     tensor_backend_id(node) = backend_index;
     SET_CAUSE(node, "usr");
+    sched->is_reset = false;
 }
 
 ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
@@ -2051,9 +2060,9 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
     GGML_ASSERT(src != NULL);
     GGML_ASSERT(src->data && "graph must be allocated");
 
-    size_t id = ggml_hash_insert(hash_set, src);
-    if (id == GGML_HASHTABLE_ALREADY_EXISTS) {
-        return node_copies[ggml_hash_find(hash_set, src)];
+    size_t id = ggml_hash_insert(&hash_set, src);
+    if (id == GGML_HASHSET_ALREADY_EXISTS) {
+        return node_copies[ggml_hash_find(&hash_set, src)];
     }
 
     struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
@@ -2078,7 +2087,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
     return dst;
 }
 
-static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
+static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
     size_t id = ggml_hash_find(hash_set, src);
     if (node_init[id]) {
         return;
@@ -2105,10 +2114,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
 }
 
 struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
-    struct ggml_hash_set hash_set = {
-        /* .size = */ graph->visited_hash_table.size,
-        /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
-    };
+    struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
     struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
     bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
 
@@ -2123,7 +2129,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
 
     if (ctx_allocated == NULL || ctx_unallocated == NULL) {
         fprintf(stderr, "failed to allocate context for graph copy\n");
-        free(hash_set.keys);
+        ggml_hash_set_free(&hash_set);
         free(node_copies);
         free(node_init);
         ggml_free(ctx_allocated);
@@ -2146,7 +2152,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
     if (buffer == NULL) {
         fprintf(stderr, "failed to allocate buffer for graph copy\n");
-        free(hash_set.keys);
+        ggml_hash_set_free(&hash_set);
         free(node_copies);
         free(node_init);
         ggml_free(ctx_allocated);
@@ -2164,19 +2170,19 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     // copy data and init views
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        graph_copy_init_tensor(hash_set, node_copies, node_init, node);
+        graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
     }
 
     // build graph copy
     struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        struct ggml_tensor * node_copy = node_copies[ggml_hash_find(hash_set, node)];
+        struct ggml_tensor * node_copy = node_copies[ggml_hash_find(&hash_set, node)];
         graph_copy->nodes[i] = node_copy;
     }
     graph_copy->n_nodes = graph->n_nodes;
 
-    free(hash_set.keys);
+    ggml_hash_set_free(&hash_set);
     free(node_copies);
     free(node_init);
 
diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp
index a37aa4072..713731735 100644
--- a/ggml/src/ggml-blas.cpp
+++ b/ggml/src/ggml-blas.cpp
@@ -275,8 +275,7 @@ GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t
                 break;
 
             default:
-                fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node));
-                GGML_ASSERT(false);
+                GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node));
         }
     }
 
diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp
index 9bf7e332a..ad5feea05 100644
--- a/ggml/src/ggml-cann.cpp
+++ b/ggml/src/ggml-cann.cpp
@@ -120,7 +120,7 @@ static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) {
             file, line);
     GGML_CANN_LOG_ERROR("  %s\n", stmt);
     // abort with GGML_ASSERT to get a stack trace
-    GGML_ASSERT(!"CANN error");
+    GGML_ABORT("CANN error");
 }
 
 /**
@@ -342,7 +342,7 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
         // memory should always buffered. these memory may still needed by
         // tasks in stream.
         // TODO, fix me.
-        GGML_ASSERT(!"Cann buffer pool full, increase MAX_CANN_BUFFERS\n");
+        GGML_ABORT("Cann buffer pool full, increase MAX_CANN_BUFFERS\n");
     }
 };
 
@@ -1874,7 +1874,7 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
         ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
                                        (aclrtEvent)event->context));
     } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }
 
diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index a02efc828..f27666970 100644
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -844,7 +844,7 @@ void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
             ggml_cann_max_pool2d(ctx, dst);
             break;
         case GGML_OP_POOL_COUNT:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
     }
 }
@@ -931,9 +931,9 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                         ((ggml_tensor*)dst->extra)->nb);
                     return;
                 }
-                GGML_ASSERT(false);
+                GGML_ABORT("fatal error");
             }
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
         if (dst->type == GGML_TYPE_F32) {
             if (ggml_are_same_shape(src, dst)) {
@@ -955,12 +955,12 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                         ((ggml_tensor*)dst->extra)->nb);
                     return;
                 }
-                GGML_ASSERT(false);
+                GGML_ABORT("fatal error");
             }
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
         // TODO
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     } else if (src->type == GGML_TYPE_F32) {
         // TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size
         //          && nb0 == type_size)
@@ -991,10 +991,10 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                         ((ggml_tensor*)dst->extra)->nb);
                     return;
                 }
-                GGML_ASSERT(false);
+                GGML_ABORT("fatal error");
             } else {
                 // TODO: dst not contiguous
-                GGML_ASSERT(false);
+                GGML_ABORT("fatal error");
             }
         }
         if (dst->type == GGML_TYPE_F16) {
@@ -1017,11 +1017,11 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                         ((ggml_tensor*)dst->extra)->nb);
                     return;
                 }
-                GGML_ASSERT(false);
+                GGML_ABORT("fatal error");
             }
         }
         // TODO
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     } else {
         if (ggml_are_same_shape(src, dst)) {
             cann_copy(ctx, acl_src, acl_dst);
@@ -1029,7 +1029,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
             ACL_CHECK(aclDestroyTensor(acl_dst));
             return;
         }
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }
 
@@ -2219,7 +2219,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 ((ggml_tensor*)dst->extra)->nb);
             break;
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
     }
 }
@@ -2492,7 +2492,7 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
             ggml_cann_mul_mat_q8_0(ctx, dst);
             break;
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
     }
 }
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index e48269e46..54ccf6bb1 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -98,7 +98,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
     GGML_CUDA_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
     GGML_CUDA_LOG_ERROR("  %s\n", stmt);
     // abort with GGML_ASSERT to get a stack trace
-    GGML_ASSERT(!"CUDA error");
+    GGML_ABORT("CUDA error");
 }
 
 // this is faster on Windows
@@ -1596,7 +1596,7 @@ static void ggml_cuda_op_mul_mat(
                     CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
                                 src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
                 } else {
-                    GGML_ASSERT(false);
+                    GGML_ABORT("fatal error");
                 }
 
                 if (quantize_src1 && !src1_is_contiguous) {
@@ -2945,7 +2945,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
 
         CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event));
 #endif
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }
 
diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
index 15757ca18..607ded855 100644
--- a/ggml/src/ggml-cuda/argsort.cu
+++ b/ggml/src/ggml-cuda/argsort.cu
@@ -81,7 +81,7 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
     } else if (order == GGML_SORT_ORDER_DESC) {
         k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
     } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }
 
diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu
index 19b08b74f..34bc67acd 100644
--- a/ggml/src/ggml-cuda/binbcast.cu
+++ b/ggml/src/ggml-cuda/binbcast.cu
@@ -259,7 +259,7 @@ static void ggml_cuda_op_bin_bcast(
     } else {
         fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
             ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }
 
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 1c2e00c1e..eac026f47 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -348,7 +348,7 @@ static __device__ void no_device_code(
 #ifdef __CUDA_ARCH__
 #define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
 #else
-#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
+#define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
 #endif // __CUDA_ARCH__
 
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index 3db57034b..aad34bfe5 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -451,7 +451,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
     } else {
         fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }
 
@@ -484,6 +484,6 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
     } else {
         fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }
diff --git a/ggml/src/ggml-cuda/dmmv.cu b/ggml/src/ggml-cuda/dmmv.cu
index 174489e06..d7a2a2513 100644
--- a/ggml/src/ggml-cuda/dmmv.cu
+++ b/ggml/src/ggml-cuda/dmmv.cu
@@ -662,7 +662,7 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
             convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
             break;
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
     }
 
diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
index f24312dd0..950fd93df 100644
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -564,7 +564,7 @@ static void on_no_fattn_vec_case(const int D) {
         fprintf(stderr, "Unsupported KV type combination for head_size 64.\n");
         fprintf(stderr, "By default only f16 KV cache is supported.\n");
         fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for V cache quantization support.\n");
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     } else if (D == 128) {
         fprintf(stderr, "Unsupported KV type combination for head_size 128.\n");
         fprintf(stderr, "Supported combinations:\n");
@@ -572,11 +572,11 @@ static void on_no_fattn_vec_case(const int D) {
         fprintf(stderr, "  - K == q8_0, V == q8_0,  8.50 BPV\n");
         fprintf(stderr, "  - K == f16,  V == f16,  16.00 BPV\n");
         fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n");
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     } else {
         fprintf(stderr, "Unsupported KV type combination for head_size 256.\n");
         fprintf(stderr, "Only f16 is supported.\n");
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }
 
diff --git a/ggml/src/ggml-cuda/fattn-tile-f16.cu b/ggml/src/ggml-cuda/fattn-tile-f16.cu
index c6c35134d..1b2fd500b 100644
--- a/ggml/src/ggml-cuda/fattn-tile-f16.cu
+++ b/ggml/src/ggml-cuda/fattn-tile-f16.cu
@@ -287,7 +287,7 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
             launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
         } break;
         default: {
-            GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
+            GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
         } break;
     }
 }
diff --git a/ggml/src/ggml-cuda/fattn-tile-f32.cu b/ggml/src/ggml-cuda/fattn-tile-f32.cu
index 15e22f495..f3e68dbfa 100644
--- a/ggml/src/ggml-cuda/fattn-tile-f32.cu
+++ b/ggml/src/ggml-cuda/fattn-tile-f32.cu
@@ -284,7 +284,7 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
             launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
         } break;
         default: {
-            GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
+            GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
         } break;
     }
 }
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
index 38d30b210..29f608b0f 100644
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -38,7 +38,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
                     ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, float>(ctx, dst);
                     break;
                 default:
-                    GGML_ASSERT(false);
+                    GGML_ABORT("fatal error");
                     break;
             }
         } else {
@@ -63,7 +63,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
                 //     ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
                 //     break;
                 default:
-                    GGML_ASSERT(false);
+                    GGML_ABORT("fatal error");
                     break;
             }
         }
@@ -86,7 +86,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
                 ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
                 break;
             default:
-                GGML_ASSERT(false);
+                GGML_ABORT("fatal error");
                 break;
         }
         return;
@@ -114,7 +114,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
                 ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
                 break;
             default:
-                GGML_ASSERT(false);
+                GGML_ABORT("fatal error");
                 break;
         }
         return;
@@ -141,7 +141,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
             ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
             break;
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
     }
 }
diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu
index 55af195fd..4c3703238 100644
--- a/ggml/src/ggml-cuda/getrows.cu
+++ b/ggml/src/ggml-cuda/getrows.cu
@@ -171,8 +171,7 @@ void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
             break;
         default:
             // TODO: k-quants
-            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
-            GGML_ASSERT(false);
+            GGML_ABORT("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
             break;
     }
 }
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 84f6387e2..78d70cd7a 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -84,7 +84,7 @@ void ggml_cuda_op_mul_mat_q(
             mul_mat_q_case<GGML_TYPE_IQ4_NL>(ctx, args, stream);
             break;
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
     }
 
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index f08a4758d..e8a957447 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -75,7 +75,7 @@ static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) {
         case GGML_TYPE_IQ4_NL:
             return MMQ_Q8_1_DS_LAYOUT_D4;
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
     }
 }
@@ -2898,7 +2898,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
             break;
         default:
             fprintf(stderr, "mmq_x_best=%d\n", mmq_x_best);
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
     }
 }
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index e22faf69b..7dbbc9939 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -162,7 +162,7 @@ static void mul_mat_vec_q_cuda(
                 rows_per_cuda_block = 2;
                 break;
             default:
-                GGML_ASSERT(false);
+                GGML_ABORT("fatal error");
                 break;
         }
     }
@@ -196,7 +196,7 @@ static void mul_mat_vec_q_cuda(
             mul_mat_vec_q<type, 8><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
             break;
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
     }
 }
@@ -413,7 +413,7 @@ void ggml_cuda_op_mul_mat_vec_q(
             mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
             break;
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
     }
 
diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu
index aa7f1eff0..45408ce86 100644
--- a/ggml/src/ggml-cuda/quantize.cu
+++ b/ggml/src/ggml-cuda/quantize.cu
@@ -163,7 +163,7 @@ void quantize_mmq_q8_1_cuda(
                 <<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
             break;
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
     }
 }
diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu
index 596fb7c13..99ec1dd98 100644
--- a/ggml/src/ggml-cuda/rope.cu
+++ b/ggml/src/ggml-cuda/rope.cu
@@ -251,7 +251,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
                 attn_factor, corr_dims, freq_factors, stream
             );
         } else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
     } else {
         if (src0->type == GGML_TYPE_F32) {
@@ -265,7 +265,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
                 attn_factor, corr_dims, freq_factors, stream
             );
         } else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
     }
 }
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index a2c8dbec0..7f7afdbfc 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -634,21 +634,121 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 #endif
 
-#define GGML_HASHTABLE_FULL ((size_t)-1)
-#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
+// bitset
+
+static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
+#define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
+#define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)
+
+static size_t ggml_bitset_size(size_t n) { // number of ggml_bitset_t words needed to store n bits
+    return (n + BITSET_MASK) >> BITSET_SHR; // (n + 31) / 32, i.e. n / 32 rounded up
+}
+
+static inline bool ggml_bitset_get(const ggml_bitset_t * bitset, size_t i) { // returns true if bit i is set (i is not bounds-checked)
+    return !!(bitset[i >> BITSET_SHR] & (1u << (i & BITSET_MASK))); // word index = i / 32, bit within word = i % 32
+}
+
+static inline void ggml_bitset_set(ggml_bitset_t * bitset, size_t i) { // sets bit i to 1 (i is not bounds-checked)
+    bitset[i >> BITSET_SHR] |= (1u << (i & BITSET_MASK)); // word index = i / 32, bit within word = i % 32
+}
+
+static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) { // sets bit i to 0 (i is not bounds-checked)
+    bitset[i >> BITSET_SHR] &= ~(1u << (i & BITSET_MASK)); // mask out only bit i % 32 of word i / 32
+}
+
+// hash set
+
+#define GGML_HASHSET_FULL ((size_t)-1)
+#define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
 
 struct ggml_hash_set ggml_hash_set_new(size_t size);
+void                 ggml_hash_set_free(struct ggml_hash_set * hash_set);
 
-bool   ggml_hash_contains      (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
+// returns the minimum size for a hash set that can hold min_sz elements
+size_t ggml_hash_size(size_t min_sz);
 
-// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
-size_t ggml_hash_find          (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
+// remove all elements from the hash set
+void ggml_hash_set_reset(struct ggml_hash_set * hash_set);
 
-// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
-size_t ggml_hash_insert        (      struct ggml_hash_set hash_set, struct ggml_tensor * key);
+// returns true if key is in the hash set
+static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
+
+// returns GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted
+static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
+
+// returns GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
+static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
 
 // return index, asserts if table is full
-size_t ggml_hash_find_or_insert(      struct ggml_hash_set hash_set, struct ggml_tensor * key);
+static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
+
+// hash function for ggml_tensor: derived from the tensor's address only, not from its contents
+static inline size_t ggml_hash(const struct ggml_tensor * p) {
+    // the last 4 bits are always zero due to alignment, so shift them out
+    return (size_t)(uintptr_t)p >> 4;
+}
+
+static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
+    size_t h = ggml_hash(key) % hash_set->size; // starting slot; hash_set->size must be non-zero
+
+    // linear probing: step past occupied slots that hold a different key
+    size_t i = h;
+    while (ggml_bitset_get(hash_set->used, i) && hash_set->keys[i] != key) {
+        i = (i + 1) % hash_set->size;
+        if (i == h) {
+            // wrapped around: every slot is occupied by another key -> not found and no free slot
+            return GGML_HASHSET_FULL;
+        }
+    }
+    return i; // either the slot that holds key, or the first free slot where key would be inserted
+}
+
+static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
+    size_t i = ggml_hash_find(hash_set, key); // index of key, of a free slot, or GGML_HASHSET_FULL
+    return i != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, i); // an unused slot means key is absent
+}
+
+static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
+    size_t h = ggml_hash(key) % hash_set->size; // starting slot; hash_set->size must be non-zero
+
+    // linear probing: scan from h until a free slot or the key itself is found
+    size_t i = h;
+    do {
+        if (!ggml_bitset_get(hash_set->used, i)) {
+            ggml_bitset_set(hash_set->used, i);
+            hash_set->keys[i] = key;
+            return i; // key was absent: stored in the first free slot
+        }
+        if (hash_set->keys[i] == key) {
+            return GGML_HASHSET_ALREADY_EXISTS;
+        }
+        i = (i + 1) % hash_set->size;
+    } while (i != h);
+
+    // visited all hash table entries without finding the key or a free slot -> table is full
+    GGML_ABORT("fatal error");
+}
+
+static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
+    size_t h = ggml_hash(key) % hash_set->size; // starting slot; hash_set->size must be non-zero
+
+    // linear probing: scan from h until a free slot or the key itself is found
+    size_t i = h;
+    do {
+        if (!ggml_bitset_get(hash_set->used, i)) {
+            ggml_bitset_set(hash_set->used, i);
+            hash_set->keys[i] = key;
+            return i; // key was absent: stored in the first free slot
+        }
+        if (hash_set->keys[i] == key) {
+            return i; // key already present: return its existing index
+        }
+        i = (i + 1) % hash_set->size;
+    } while (i != h);
+
+    // visited all hash table entries without finding the key or a free slot -> table is full
+    GGML_ABORT("fatal error");
+}
 
 #ifdef __cplusplus
 }
diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp
index ed5f2e349..41ac63fa4 100644
--- a/ggml/src/ggml-kompute.cpp
+++ b/ggml/src/ggml-kompute.cpp
@@ -566,7 +566,7 @@ uint32_t safe_divide(uint32_t a, uint32_t b) {
     }
     if ((a % b) != 0) {
         fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, a % b);
-        GGML_ASSERT(!"safe_divide result would've had remainder");
+        GGML_ABORT("safe_divide result would've had remainder");
     }
     return a / b;
 }
@@ -1460,7 +1460,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
 
             if (!ggml_vk_supports_op(dst)) {
                  fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
-                 GGML_ASSERT(!"unsupported op");
+                 GGML_ABORT("unsupported op");
             }
 
             const int32_t ne00 = src0 ? src0->ne[0] : 0;
@@ -1562,7 +1562,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                             default:
                                 {
                                     fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-                                    GGML_ASSERT(false);
+                                    GGML_ABORT("fatal error");
                                 }
                         }
                     } break;
@@ -1745,7 +1745,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
             continue;
             not_implemented: {}
             fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-            //GGML_ASSERT(false);
+            //GGML_ABORT("fatal error");
         }
 
         // Evaluate sequence
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index a7619bcca..48b813131 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -869,7 +869,7 @@ static enum ggml_status ggml_metal_graph_compute(
         NSError * error = nil;
         if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
             GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
-            GGML_ASSERT(!"capture failed");
+            GGML_ABORT("capture failed");
         }
     }
 
@@ -931,7 +931,7 @@ static enum ggml_status ggml_metal_graph_compute(
 
             if (!ggml_metal_supports_op(ctx, dst)) {
                 GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
-                GGML_ASSERT(!"unsupported op");
+                GGML_ABORT("unsupported op");
             }
 
             if (should_capture) {
@@ -1068,7 +1068,7 @@ static enum ggml_status ggml_metal_graph_compute(
                                 case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
                                 case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
                                 case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
-                                default: GGML_ASSERT(false);
+                                default: GGML_ABORT("fatal error");
                             }
 
                             bcast_row = true;
@@ -1077,7 +1077,7 @@ static enum ggml_status ggml_metal_graph_compute(
                                 case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
                                 case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
                                 case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
-                                default: GGML_ASSERT(false);
+                                default: GGML_ABORT("fatal error");
                             }
                         }
 
@@ -1131,7 +1131,7 @@ static enum ggml_status ggml_metal_graph_compute(
                             case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F16].pipeline; break;
                             case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I32].pipeline; break;
                             case GGML_TYPE_I16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I16].pipeline; break;
-                            default: GGML_ASSERT(false);
+                            default: GGML_ABORT("fatal error");
                         }
 
                         [encoder setComputePipelineState:pipeline];
@@ -1387,7 +1387,7 @@ static enum ggml_status ggml_metal_graph_compute(
                         default:
                             {
                                 GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-                                GGML_ASSERT(false);
+                                GGML_ABORT("fatal error");
                             }
                     } break;
                 case GGML_OP_SQR:
@@ -1609,7 +1609,7 @@ static enum ggml_status ggml_metal_graph_compute(
                                 case GGML_TYPE_IQ1_M:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32  ].pipeline; break;
                                 case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
                                 case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break;
-                                default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
+                                default: GGML_ABORT("MUL MAT-MAT not implemented");
                             }
 
                             [encoder setComputePipelineState:pipeline];
@@ -1782,7 +1782,7 @@ static enum ggml_status ggml_metal_graph_compute(
                                 default:
                                     {
                                         GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
-                                        GGML_ASSERT(false && "not implemented");
+                                        GGML_ABORT("not implemented");
                                     }
                             };
 
@@ -1911,7 +1911,7 @@ static enum ggml_status ggml_metal_graph_compute(
                                 case GGML_TYPE_IQ1_M:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32  ].pipeline; break;
                                 case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break;
                                 case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break;
-                                default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
+                                default: GGML_ABORT("MUL_MAT_ID not implemented");
                             }
 
                             [encoder setComputePipelineState:pipeline];
@@ -2078,7 +2078,7 @@ static enum ggml_status ggml_metal_graph_compute(
                                 default:
                                     {
                                         GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
-                                        GGML_ASSERT(false && "not implemented");
+                                        GGML_ABORT("not implemented");
                                     }
                             };
 
@@ -2178,7 +2178,7 @@ static enum ggml_status ggml_metal_graph_compute(
                             case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break;
                             case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS ].pipeline; break;
                             case GGML_TYPE_I32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32    ].pipeline; break;
-                            default: GGML_ASSERT(false && "not implemented");
+                            default: GGML_ABORT("not implemented");
                         }
 
                         [encoder setComputePipelineState:pipeline];
@@ -2316,13 +2316,13 @@ static enum ggml_status ggml_metal_graph_compute(
                             switch (src0->type) {
                                 case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
                                 case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
-                                default: GGML_ASSERT(false);
+                                default: GGML_ABORT("fatal error");
                             };
                         } else {
                             switch (src0->type) {
                                 case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
                                 case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
-                                default: GGML_ASSERT(false);
+                                default: GGML_ABORT("fatal error");
                             };
                         }
 
@@ -2399,7 +2399,7 @@ static enum ggml_status ggml_metal_graph_compute(
                         switch (dst->type) {
                             case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline; break;
                             case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break;
-                            default: GGML_ASSERT(false);
+                            default: GGML_ABORT("fatal error");
                         };
 
                         [encoder setComputePipelineState:pipeline];
@@ -2556,7 +2556,7 @@ static enum ggml_status ggml_metal_graph_compute(
                         switch (order) {
                             case GGML_SORT_ORDER_ASC:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline;  break;
                             case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
-                            default: GGML_ASSERT(false);
+                            default: GGML_ABORT("fatal error");
                         };
 
                         [encoder setComputePipelineState:pipeline];
@@ -2645,7 +2645,7 @@ static enum ggml_status ggml_metal_graph_compute(
                                           {
                                               GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
                                               GGML_METAL_LOG_ERROR("add template specialization for this size\n");
-                                              GGML_ASSERT(false && "add template specialization for this size");
+                                              GGML_ABORT("add template specialization for this size");
                                           }
                             }
                         } else {
@@ -2658,7 +2658,7 @@ static enum ggml_status ggml_metal_graph_compute(
                                           {
                                               GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
                                               GGML_METAL_LOG_ERROR("add template specialization for this size\n");
-                                              GGML_ASSERT(false && "add template specialization for this size");
+                                              GGML_ABORT("add template specialization for this size");
                                           }
                             }
                         }
@@ -2779,7 +2779,7 @@ static enum ggml_status ggml_metal_graph_compute(
                                         case GGML_TYPE_Q5_0:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break;
                                         case GGML_TYPE_Q5_1:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break;
                                         case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL].pipeline; break;
-                                        default: GGML_ASSERT(false && "not implemented");
+                                        default: GGML_ABORT("not implemented");
                                     };
                                 } break;
                             case GGML_TYPE_F16:
@@ -2787,10 +2787,10 @@ static enum ggml_status ggml_metal_graph_compute(
                                     switch (dstt) {
                                         case GGML_TYPE_F32:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break;
                                         case GGML_TYPE_F16:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break;
-                                        default: GGML_ASSERT(false && "not implemented");
+                                        default: GGML_ABORT("not implemented");
                                     };
                                 } break;
-                            default: GGML_ASSERT(false && "not implemented");
+                            default: GGML_ABORT("not implemented");
                         }
 
                         [encoder setComputePipelineState:pipeline];
@@ -2818,7 +2818,7 @@ static enum ggml_status ggml_metal_graph_compute(
                 default:
                     {
                         GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-                        GGML_ASSERT(false);
+                        GGML_ABORT("fatal error");
                     }
             }
 
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 47418597c..1c6c85aac 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -12692,7 +12692,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
                     printf("Oops: found point %u not on grid:", u);
                     for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
                     printf("\n");
-                    GGML_ASSERT(false);
+                    GGML_ABORT("fatal error");
                 }
                 q2[2*ib+0] |= ((uint32_t) grid_index << 8*k);
                 q2[2*ib+1] |= (block_signs[k] << 7*k);
@@ -12871,7 +12871,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
                     printf("Oops: found point %u not on grid:", u);
                     for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
                     printf("\n");
-                    GGML_ASSERT(false);
+                    GGML_ABORT("fatal error");
                 }
                 q2[2*ib+k] = grid_index | (block_signs[k] << 9);
             }
@@ -13314,7 +13314,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
                     printf("Oops: found point %u not on grid:", u);
                     for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
                     printf("\n");
-                    GGML_ASSERT(false);
+                    GGML_ABORT("fatal error");
                 }
                 if (grid_size == 256) {
                     q3[8*ib+k] = grid_index;
@@ -13527,7 +13527,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
                     printf("Oops: found point %u not on grid:", u);
                     for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
                     printf("\n");
-                    GGML_ASSERT(false);
+                    GGML_ABORT("fatal error");
                 }
                 qs[k] = grid_index & 255;
                 qh[(ib*bs4+k)/8] |= ((grid_index >> 8) << ((ib*bs4+k)%8));
@@ -14503,7 +14503,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
                     printf("Oops: found point %u not on grid:", u);
                     for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
                     printf("\n");
-                    GGML_ASSERT(false);
+                    GGML_ABORT("fatal error");
                 }
                 const int i8 = 2*ib + k;
                 y[ibl].qs[i8] = grid_index & 255;
@@ -14623,7 +14623,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
     }
 
     if (nbytes % ggml_type_size(type) != 0) {
-        fprintf(stderr, "%s: invalid size %zu for type %d\n", __func__, nbytes, type);
+        fprintf(stderr, "%s: invalid size %zu for type %s (type size = %zu)\n", __func__, nbytes, ggml_type_name(type), ggml_type_size(type));
         return false;
     }
 
diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp
index 36518ff93..7cb07d0dc 100644
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@@ -1723,7 +1723,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
                 });
         });
     } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }
 
@@ -2075,8 +2075,8 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
         // GGML_SYCL_DEBUG("current device index %d\n", id);
         src_ptr = (char *) extra->data_device[id];
     } else {
-        // GGML_SYCL_DEBUG("GGML_ASSERT(false)\n");
-        GGML_ASSERT(false);
+        // GGML_SYCL_DEBUG("GGML_ABORT(\"fatal error\")\n");
+        GGML_ABORT("fatal error");
     }
     char * dst_ptr = (char *) dst;
 
@@ -2163,7 +2163,7 @@ static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_te
         default:
             // TODO: k-quants
             fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
     }
 }
@@ -2192,7 +2192,7 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t
     } else {
         fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
             ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }
 
@@ -2476,7 +2476,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
         case GGML_TYPE_Q6_K:
             return 64;
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
     }
 
 }
@@ -3101,7 +3101,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
                     SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
                                    src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
                 } else {
-                    GGML_ASSERT(false);
+                    GGML_ABORT("fatal error");
                 }
 
                 if (convert_src1_to_q8_1 && !src1_is_contiguous) {
@@ -3896,7 +3896,7 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
     } else {
         fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 
     (void) dst;
diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index 397bd98dd..86d8b40e8 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -100,7 +100,7 @@ static void crash() {
     const char* msg) {
   fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg);
   fprintf(stderr, "  in function %s at %s:%d\n", func, file, line);
-  GGML_ASSERT(!"SYCL error");
+  GGML_ABORT("SYCL error");
 }
 
 #define SYCL_CHECK(err)                     \
diff --git a/ggml/src/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp
index 70a94fc16..ae45630e1 100644
--- a/ggml/src/ggml-sycl/dmmv.cpp
+++ b/ggml/src/ggml-sycl/dmmv.cpp
@@ -1011,7 +1011,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
             break;
         default:
             printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type);
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
     }
 
diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp
index 4aaa76bfb..ef4609e32 100644
--- a/ggml/src/ggml-sycl/dpct/helper.hpp
+++ b/ggml/src/ggml-sycl/dpct/helper.hpp
@@ -975,7 +975,7 @@ namespace dpct
             if (backend == "opencl:cpu") return 4;
             if (backend == "opencl:acc") return 5;
             printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
         static bool compare_backend(std::string &backend1, std::string &backend2) {
             return convert_backend_index(backend1) < convert_backend_index(backend2);
diff --git a/ggml/src/ggml-sycl/mmq.cpp b/ggml/src/ggml-sycl/mmq.cpp
index 3107ba919..e952533d3 100644
--- a/ggml/src/ggml-sycl/mmq.cpp
+++ b/ggml/src/ggml-sycl/mmq.cpp
@@ -1799,7 +1799,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
         mmq_y  =  MMQ_Y_Q4_0_PASCAL;
         nwarps = NWARPS_Q4_0_PASCAL;
     } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 
     const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -1914,7 +1914,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
         mmq_y  =  MMQ_Y_Q4_1_PASCAL;
         nwarps = NWARPS_Q4_1_PASCAL;
     } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 
     const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -2029,7 +2029,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
         mmq_y  =  MMQ_Y_Q5_0_PASCAL;
         nwarps = NWARPS_Q5_0_PASCAL;
     } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 
     const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -2144,7 +2144,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
         mmq_y  =  MMQ_Y_Q5_1_PASCAL;
         nwarps = NWARPS_Q5_1_PASCAL;
     } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 
     const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -2259,7 +2259,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
         mmq_y  =  MMQ_Y_Q8_0_PASCAL;
         nwarps = NWARPS_Q8_0_PASCAL;
     } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 
     const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -2374,7 +2374,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
         mmq_y  =  MMQ_Y_Q2_K_PASCAL;
         nwarps = NWARPS_Q2_K_PASCAL;
     } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 
     const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -2497,7 +2497,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
         mmq_y  =  MMQ_Y_Q3_K_PASCAL;
         nwarps = NWARPS_Q3_K_PASCAL;
     } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 
     const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -2625,7 +2625,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
         mmq_y  =  MMQ_Y_Q4_K_PASCAL;
         nwarps = NWARPS_Q4_K_PASCAL;
     } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 
     const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -2746,7 +2746,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
         mmq_y  =  MMQ_Y_Q5_K_PASCAL;
         nwarps = NWARPS_Q5_K_PASCAL;
     } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 
     const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -2867,7 +2867,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
         mmq_y  =  MMQ_Y_Q6_K_PASCAL;
         nwarps = NWARPS_Q6_K_PASCAL;
     } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 
     const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -3016,7 +3016,7 @@ void ggml_sycl_op_mul_mat_q(
             ggml_mul_mat_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
     }
 
diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp
index 3fbc4dd60..23232357e 100644
--- a/ggml/src/ggml-sycl/mmvq.cpp
+++ b/ggml/src/ggml-sycl/mmvq.cpp
@@ -1017,7 +1017,7 @@ void ggml_sycl_op_mul_mat_vec_q(
             mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
             break;
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
         }
     }
diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp
index 6f507941a..c7545bcc1 100644
--- a/ggml/src/ggml-sycl/rope.cpp
+++ b/ggml/src/ggml-sycl/rope.cpp
@@ -251,7 +251,7 @@ void ggml_sycl_op_rope(
                 attn_factor, corr_dims, freq_factors, main_stream
             );
         } else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
     } else {
         if (src0->type == GGML_TYPE_F32) {
@@ -265,7 +265,7 @@ void ggml_sycl_op_rope(
                 attn_factor, corr_dims, freq_factors, main_stream
             );
         } else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
     }
 
diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
index 6bcd81a7b..74991f6d1 100644
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -1961,7 +1961,7 @@ void ggml_vk_instance_init() {
         // Make sure at least one device exists
         if (devices.empty()) {
             std::cerr << "ggml_vulkan: Error: No devices found." << std::endl;
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
 
         // Default to using all dedicated GPUs
@@ -2459,7 +2459,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
     // Buffer is already mapped
     if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
         std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl;
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
     // Check if src is pinned memory
     vk_buffer buf;
@@ -2527,7 +2527,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
             staging = ctx->device->sync_staging;
             staging_offset = 0;
         } else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
     }
 
@@ -2563,7 +2563,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, s
     // Buffer is already mapped
     if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
         std::cerr << "ggml_vulkan: buffer_write_async dst buffer is host_visible. Use synchronous write." << std::endl;
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
     // Check if src is pinned memory
     vk_buffer buf = nullptr;
@@ -2602,7 +2602,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, s
             staging_buffer = dst->device->sync_staging;
             staging_offset = 0;
         } else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
     }
 
@@ -2704,7 +2704,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context * subctx, vk_buffer& src, si
 
             staging_buffer = src->device->sync_staging;
         } else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
     }
 
@@ -2913,7 +2913,7 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
     }
 
     std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl;
-    GGML_ASSERT(false);
+    GGML_ABORT("fatal error");
 }
 
 static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
@@ -3499,7 +3499,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
     const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig;
 
     if (mmp == nullptr) {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 
     // Not implemented
@@ -4078,7 +4078,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
                 std::cerr << " and " << ggml_type_name(src1->type);
             }
             std::cerr << " to " << ggml_type_name(dst->type) << std::endl;
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
 
         op_func(ctx, subctx, src0, src1, dst);
@@ -4521,7 +4521,7 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0
                 } else if (type == GGML_TYPE_F16) {
                     val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0));
                 } else {
-                    GGML_ASSERT(false);
+                    GGML_ABORT("fatal error");
                 }
                 fprintf(stderr, "% 7.2f ", val);
             } else {
@@ -4555,7 +4555,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
             p = ctx->device->pipeline_matmul_f16->a_s;
             shname = "F16_ALIGNED_S";
         } else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
     } else if (shader_size == 1) {
         if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
@@ -4571,7 +4571,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
             p = ctx->device->pipeline_matmul_f16->a_m;
             shname = "F16_ALIGNED_M";
         } else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
     } else if (shader_size == 2) {
         if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
@@ -4587,7 +4587,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
             p = ctx->device->pipeline_matmul_f16->a_l;
             shname = "F16_ALIGNED_L";
         } else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
     } else {
         GGML_ASSERT(0);
@@ -4668,7 +4668,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
         } else if (std::is_same<ggml_fp16_t, X_TYPE>()) {
             x[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f);
         } else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
     }
     for (size_t i = 0; i < y_ne; i++) {
@@ -4679,7 +4679,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
             // y[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f);
             y[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f);
         } else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
     }
 
@@ -4727,14 +4727,14 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     } else if (std::is_same<ggml_fp16_t, X_TYPE>()) {
         src0_type = GGML_TYPE_F16;
     } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
     if (std::is_same<float, Y_TYPE>()) {
         src1_type = GGML_TYPE_F32;
     } else if (std::is_same<ggml_fp16_t, Y_TYPE>()) {
         src1_type = GGML_TYPE_F16;
     } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 
     ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, src0_type, k, m, batch);
@@ -4841,7 +4841,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1
                 } else if (tensor->type == GGML_TYPE_F16) {
                     val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
                 } else {
-                    GGML_ASSERT(false);
+                    GGML_ABORT("fatal error");
                 }
                 fprintf(stderr, "% 7.2f ", val);
             } else {
@@ -5391,7 +5391,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
         std::cerr << std::endl;
     }
 
-    GGML_ASSERT(false);
+    GGML_ABORT("fatal error");
 #endif
 
     if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
@@ -5486,7 +5486,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         break;
     default:
         std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
         return;
     }
 
@@ -6498,7 +6498,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
                 } else if (tensor->type == GGML_TYPE_I32) {
                     val = *(const int32_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
                 } else {
-                    GGML_ASSERT(false);
+                    GGML_ABORT("fatal error");
                 }
                 fprintf(stderr, "% 7.2f ", val);
             } else {
@@ -6620,7 +6620,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
                 memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
             }
         } else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
 
         if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
@@ -6662,7 +6662,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
                 memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
             }
         } else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
 
         if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
@@ -6720,7 +6720,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
                 memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
             }
         } else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
 
         if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
@@ -6797,7 +6797,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
             break;
         default:
             std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
     } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
         if (src1 == nullptr) {
@@ -6825,7 +6825,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
         tensor_clone = ggml_sum_rows(ggml_ctx, src0_clone);
     } else {
         std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 
     ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
@@ -6912,7 +6912,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
                         }
                     } else {
                         std::cerr << "Missing debug code for type " << ggml_type_name(tensor->type) << std::endl;
-                        GGML_ASSERT(false);
+                        GGML_ABORT("fatal error");
                     }
 
                     if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) {
@@ -6935,7 +6935,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
                         std::cerr << std::endl;
                         std::vector<const ggml_tensor *> done;
                         ggml_vk_print_graph_origin(tensor, done);
-                        GGML_ASSERT(false);
+                        GGML_ABORT("fatal error");
                     }
                     if (first_error[0] == -1 && std::fabs(correct - result) > 0.1f) {
                         first_error[0] = i0;
@@ -7006,7 +7006,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
         std::cerr << std::endl;
         std::vector<const ggml_tensor *> done;
         ggml_vk_print_graph_origin(tensor, done);
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     } else {
         std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl;
     }
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 29afcc7f8..c196fd5bf 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -141,23 +141,25 @@ typedef pthread_t ggml_thread_t;
 
 #include <sys/wait.h>
 
-void ggml_print_backtrace(void) {
-    /*
-    #include <execinfo.h>
-    #include <dlfcn.h>
-
+#if defined(__linux__) // only Linux builds get the <execinfo.h>-based implementation
+#include <execinfo.h> // backtrace(), backtrace_symbols_fd()
+static void ggml_print_backtrace_symbols(void) { // write up to 100 raw stack frames of the caller to stderr
     void * trace[100];
-
     int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
-
     backtrace_symbols_fd(trace, nptrs, STDERR_FILENO);
-    */
+}
+#else
+static void ggml_print_backtrace_symbols(void) { // stub so callers need no platform checks
+    // platform not supported: intentionally prints nothing
+}
+#endif
 
-    // backtrack_symbols does not show line numbers, use gdb instead
+static void ggml_print_backtrace(void) { // best-effort: have a debugger print this process's stack, else fall back to backtrace symbols
     char attach[32];
     snprintf(attach, sizeof(attach), "attach %d", getpid());
     int pid = fork();
     if (pid == 0) {
+        // child: try gdb first; "attach <pid>" names the parent because getpid() ran before fork()
         execlp("gdb", "gdb", "--batch",
             "-ex", "set style enabled on",
             "-ex", attach,
@@ -165,16 +167,46 @@ void ggml_print_backtrace(void) {
             "-ex", "detach",
             "-ex", "quit",
             (char *) NULL);
+        // execlp() only returns on failure (gdb could not be started): try lldb
+        execlp("lldb", "lldb", "--batch",
+            "-o", "bt",
+            "-o", "quit",
+            "-p", &attach[sizeof("attach ") - 1], // lldb's -p takes the bare pid, so skip the "attach " prefix
+            (char *) NULL);
+        exit(EXIT_FAILURE);
     } else {
-        waitpid(pid, NULL, 0);
+        int wstatus = 0;
+        // fork() failing leaves pid == -1; waitpid(-1, ...) would wait on unrelated children, so skip it
+        const int reaped = pid > 0 ? waitpid(pid, &wstatus, 0) : -1;
+        if (reaped < 0 || (WIFEXITED(wstatus) &&
+                           WEXITSTATUS(wstatus) == EXIT_FAILURE)) {
+            // no debugger output (fork/wait failed, or gdb and lldb both failed), fallback to backtrace_symbols
+            ggml_print_backtrace_symbols();
+        }
     }
 }
 #else
-void ggml_print_backtrace(void) {
+static void ggml_print_backtrace(void) { // no-op stub for the #else branch; file-local like the real implementation
     // platform not supported
 }
 #endif
 
+void ggml_abort(const char * file, int line, const char * fmt, ...) { // report "<file>:<line>: <message>" on stderr, print a backtrace, then abort()
+    fflush(stdout); // flush buffered stdout before the report is written to stderr
+
+    fprintf(stderr, "%s:%d: ", file, line); // location prefix supplied by the caller
+
+    va_list args;
+    va_start(args, fmt);
+    vfprintf(stderr, fmt, args); // printf-style message built from fmt and the variadic arguments
+    va_end(args);
+
+    fprintf(stderr, "\n");
+
+    ggml_print_backtrace(); // best-effort; a no-op stub on platforms without support
+    abort(); // terminates the process; this function never returns to the caller
+}
+
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
 #define GGML_GELU_QUICK_FP16
@@ -246,7 +278,7 @@ inline static void * ggml_aligned_malloc(size_t size) {
                 break;
         }
         GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
         return NULL;
     }
     return aligned_memory;
@@ -267,7 +299,7 @@ inline static void * ggml_malloc(size_t size) {
     void * result = malloc(size);
     if (result == NULL) {
         GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
     return result;
 }
@@ -281,7 +313,7 @@ inline static void * ggml_calloc(size_t num, size_t size) {
     void * result = calloc(num, size);
     if (result == NULL) {
         GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
     return result;
 }
@@ -3372,7 +3404,7 @@ static inline int ggml_up(int n, int m) {
 }
 
 // assert that pointer is aligned to GGML_MEM_ALIGN
-#define ggml_assert_aligned(ptr) \
+#define GGML_ASSERT_ALIGNED(ptr) \
     GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -3473,7 +3505,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
     GGML_ASSERT(ctx->mem_buffer != NULL);
 
-    ggml_assert_aligned(ctx->mem_buffer);
+    GGML_ASSERT_ALIGNED(ctx->mem_buffer);
 
     GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
 
@@ -3605,7 +3637,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
         .type = type,
     };
 
-    ggml_assert_aligned(mem_buffer + obj_new->offs);
+    GGML_ASSERT_ALIGNED(mem_buffer + obj_new->offs);
 
     if (obj_cur != NULL) {
         obj_cur->next = obj_new;
@@ -3706,7 +3738,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
 #endif
 
     // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
-    //ggml_assert_aligned(result->data);
+    //GGML_ASSERT_ALIGNED(result->data);
 
     for (int i = 0; i < n_dims; i++) {
         result->ne[i] = ne[i];
@@ -3879,8 +3911,8 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 
     return tensor;
@@ -3938,8 +3970,8 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 
     return tensor;
@@ -4008,11 +4040,9 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
             }
         default:
             {
-                GGML_ASSERT(false);
+                GGML_ABORT("fatal error");
             }
     }
-
-    return 0.0f;
 }
 
 void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
@@ -4055,8 +4085,8 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -4076,10 +4106,8 @@ int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i
         case GGML_TYPE_F32:
             return ((float *) data)[0];
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
     }
-
-    return 0.0f;
 }
 
 void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value) {
@@ -4111,8 +4139,8 @@ void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -4149,11 +4177,9 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
             }
         default:
             {
-                GGML_ASSERT(false);
+                GGML_ABORT("fatal error");
             }
     }
-
-    return 0.0f;
 }
 
 void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
@@ -4190,8 +4216,8 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -4211,10 +4237,8 @@ float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
         case GGML_TYPE_F32:
             return ((float *) data)[0];
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
     }
-
-    return 0.0f;
 }
 
 void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value) {
@@ -4246,8 +4270,8 @@ void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -4270,8 +4294,11 @@ const char * ggml_get_name(const struct ggml_tensor * tensor) {
 }
 
 struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
-    strncpy(tensor->name, name, sizeof(tensor->name) - 1);
-    tensor->name[sizeof(tensor->name) - 1] = '\0';
+    size_t i; // number of characters copied; doubles as the index of the terminator
+    for (i = 0; i < sizeof(tensor->name) - 1 && name[i] != '\0'; i++) { // stop at name's end or one byte short of the buffer
+        tensor->name[i] = name[i];
+    }
+    tensor->name[i] = '\0'; // always NUL-terminate; over-long names are truncated
     return tensor;
 }
 
@@ -4842,7 +4869,7 @@ struct ggml_tensor * ggml_mean(
     bool is_node = false;
 
     if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement
+        GGML_ABORT("fatal error"); // TODO: implement
         is_node = true;
     }
 
@@ -4865,7 +4892,7 @@ struct ggml_tensor * ggml_argmax(
     bool is_node = false;
 
     if (a->grad) {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
         is_node = true;
     }
 
@@ -5188,7 +5215,7 @@ static struct ggml_tensor * ggml_norm_impl(
     bool is_node = false;
 
     if (!inplace && (a->grad)) {
-        GGML_ASSERT(false); // TODO: implement backward
+        GGML_ABORT("fatal error"); // TODO: implement backward
         is_node = true;
     }
 
@@ -5291,7 +5318,7 @@ static struct ggml_tensor * ggml_group_norm_impl(
 
     bool is_node = false;
     if (!inplace && (a->grad)) {
-        GGML_ASSERT(false); // TODO: implement backward
+        GGML_ABORT("fatal error"); // TODO: implement backward
         is_node = true;
     }
 
@@ -5705,7 +5732,7 @@ struct ggml_tensor * ggml_reshape(
 
     if (b->grad) {
         // gradient propagation is not supported
-        //GGML_ASSERT(false);
+        //GGML_ABORT("fatal error");
     }
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
@@ -6488,7 +6515,7 @@ struct ggml_tensor * ggml_clamp(
     bool is_node = false;
 
     if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
+        GGML_ABORT("fatal error"); // TODO: implement backward
         is_node = true;
     }
 
@@ -6564,7 +6591,7 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
     bool is_node = false;
 
     if (a->grad || b->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
+        GGML_ABORT("fatal error"); // TODO: implement backward
         is_node = true;
     }
 
@@ -6636,7 +6663,7 @@ struct ggml_tensor * ggml_im2col(
     bool is_node = false;
 
     if (a->grad || b->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
+        GGML_ABORT("fatal error"); // TODO: implement backward
         is_node = true;
     }
 
@@ -6722,7 +6749,7 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0(
     bool is_node = false;
 
     if (a->grad || b->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
+        GGML_ABORT("fatal error"); // TODO: implement backward
         is_node = true;
     }
 
@@ -6763,7 +6790,7 @@ struct ggml_tensor * ggml_pool_1d(
     bool is_node = false;
 
     if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
+        GGML_ABORT("fatal error"); // TODO: implement backward
         is_node = true;
     }
 
@@ -6801,7 +6828,7 @@ struct ggml_tensor * ggml_pool_2d(
     bool is_node = false;
 
     if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
+        GGML_ABORT("fatal error"); // TODO: implement backward
         is_node = true;
     }
 
@@ -6834,7 +6861,7 @@ static struct ggml_tensor * ggml_upscale_impl(
     bool is_node = false;
 
     if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
+        GGML_ABORT("fatal error"); // TODO: implement backward
         is_node = true;
     }
 
@@ -6884,7 +6911,7 @@ struct ggml_tensor * ggml_pad(
     bool is_node = false;
 
     if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
+        GGML_ABORT("fatal error"); // TODO: implement backward
         is_node = true;
     }
 
@@ -6933,7 +6960,7 @@ struct ggml_tensor * ggml_timestep_embedding(
     bool is_node = false;
 
     if (timesteps->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
+        GGML_ABORT("fatal error"); // TODO: implement backward
         is_node = true;
     }
 
@@ -7059,7 +7086,7 @@ struct ggml_tensor * ggml_flash_attn_back(
         struct ggml_tensor  * v,
         struct ggml_tensor  * d,
         bool                  masked) {
-    GGML_ASSERT(false && "TODO: adapt to ggml_flash_attn_ext() changes");
+    GGML_ABORT("TODO: adapt to ggml_flash_attn_ext() changes");
 
     GGML_ASSERT(ggml_can_mul_mat(k, q));
     // TODO: check if vT can be multiplied by (k*qT)
@@ -7158,7 +7185,7 @@ struct ggml_tensor * ggml_ssm_conv(
     bool is_node = false;
 
     if (s->grad || x->grad || c->grad || sq->grad) {
-        GGML_ASSERT(false); // TODO: implement
+        GGML_ABORT("fatal error"); // TODO: implement
         is_node = true;
     }
 
@@ -7212,7 +7239,7 @@ struct ggml_tensor * ggml_ssm_scan(
     bool is_node = false;
 
     if (s->grad || x->grad || dt->grad || A->grad || B->grad || C->grad || sq->grad) {
-        GGML_ASSERT(false); // TODO: implement
+        GGML_ABORT("fatal error"); // TODO: implement
         is_node = true;
     }
 
@@ -7244,7 +7271,7 @@ struct ggml_tensor * ggml_win_part(
     bool is_node = false;
 
     if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
+        GGML_ABORT("fatal error"); // TODO: implement backward
         is_node = true;
     }
 
@@ -7282,7 +7309,7 @@ struct ggml_tensor * ggml_win_unpart(
     bool is_node = false;
 
     if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
+        GGML_ABORT("fatal error"); // TODO: implement backward
         is_node = true;
     }
 
@@ -7312,7 +7339,7 @@ struct ggml_tensor * ggml_get_rel_pos(
     bool is_node = false;
 
     if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
+        GGML_ABORT("fatal error"); // TODO: implement backward
         is_node = true;
     }
 
@@ -8002,7 +8029,7 @@ static void ggml_compute_forward_dup_f16(
                     }
                 }
             } else {
-                GGML_ASSERT(false); // TODO: implement
+                GGML_ABORT("fatal error"); // TODO: implement
             }
         } else {
             //printf("%s: this is not optimal - fix me\n", __func__);
@@ -8044,7 +8071,7 @@ static void ggml_compute_forward_dup_f16(
                     }
                 }
             } else {
-                GGML_ASSERT(false); // TODO: implement
+                GGML_ABORT("fatal error"); // TODO: implement
             }
         }
         return;
@@ -8161,7 +8188,7 @@ static void ggml_compute_forward_dup_f16(
             }
         }
     } else {
-        GGML_ASSERT(false); // TODO: implement
+        GGML_ABORT("fatal error"); // TODO: implement
     }
 }
 
@@ -8288,7 +8315,7 @@ static void ggml_compute_forward_dup_bf16(
                     }
                 }
             } else {
-                GGML_ASSERT(false); // TODO: implement
+                GGML_ABORT("fatal error"); // TODO: implement
             }
         } else {
             //printf("%s: this is not optimal - fix me\n", __func__);
@@ -8348,7 +8375,7 @@ static void ggml_compute_forward_dup_bf16(
                     }
                 }
             } else {
-                GGML_ASSERT(false); // TODO: implement
+                GGML_ABORT("fatal error"); // TODO: implement
             }
         }
         return;
@@ -8517,7 +8544,7 @@ static void ggml_compute_forward_dup_bf16(
             }
         }
     } else {
-        GGML_ASSERT(false); // TODO: implement
+        GGML_ABORT("fatal error"); // TODO: implement
     }
 }
 
@@ -8603,7 +8630,7 @@ static void ggml_compute_forward_dup_f32(
                     }
                 }
             } else {
-                GGML_ASSERT(false); // TODO: implement
+                GGML_ABORT("fatal error"); // TODO: implement
             }
         } else {
             //printf("%s: this is not optimal - fix me\n", __func__);
@@ -8663,7 +8690,7 @@ static void ggml_compute_forward_dup_f32(
                     }
                 }
             } else {
-                GGML_ASSERT(false); // TODO: implement
+                GGML_ABORT("fatal error"); // TODO: implement
             }
         }
 
@@ -8834,7 +8861,7 @@ static void ggml_compute_forward_dup_f32(
             }
         }
     } else {
-        GGML_ASSERT(false); // TODO: implement
+        GGML_ABORT("fatal error"); // TODO: implement
     }
 }
 
@@ -9012,8 +9039,8 @@ static void ggml_compute_forward_dup(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -9165,7 +9192,7 @@ static void ggml_compute_forward_add_f16_f32(
     }
     else {
         // src1 is not contiguous
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }
 
@@ -9240,7 +9267,7 @@ static void ggml_compute_forward_add_bf16_f32(
     }
     else {
         // src1 is not contiguous
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }
 
@@ -9292,7 +9319,7 @@ static void ggml_compute_forward_add_f16_f16(
     }
     else {
         // src1 is not contiguous
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }
 
@@ -9344,7 +9371,7 @@ static void ggml_compute_forward_add_bf16_bf16(
     }
     else {
         // src1 is not contiguous
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }
 
@@ -9438,7 +9465,7 @@ static void ggml_compute_forward_add(
                     ggml_compute_forward_add_f32(params, dst);
                 }
                 else {
-                    GGML_ASSERT(false);
+                    GGML_ABORT("fatal error");
                 }
             } break;
         case GGML_TYPE_F16:
@@ -9450,7 +9477,7 @@ static void ggml_compute_forward_add(
                     ggml_compute_forward_add_f16_f32(params, dst);
                 }
                 else {
-                    GGML_ASSERT(false);
+                    GGML_ABORT("fatal error");
                 }
             } break;
         case GGML_TYPE_BF16:
@@ -9462,7 +9489,7 @@ static void ggml_compute_forward_add(
                     ggml_compute_forward_add_bf16_f32(params, dst);
                 }
                 else {
-                    GGML_ASSERT(false);
+                    GGML_ABORT("fatal error");
                 }
             } break;
         case GGML_TYPE_Q4_0:
@@ -9492,8 +9519,8 @@ static void ggml_compute_forward_add(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -9827,7 +9854,7 @@ static void ggml_compute_forward_add1(
                     ggml_compute_forward_add1_f16_f32(params, dst);
                 }
                 else {
-                    GGML_ASSERT(false);
+                    GGML_ABORT("fatal error");
                 }
             } break;
         case GGML_TYPE_BF16:
@@ -9839,7 +9866,7 @@ static void ggml_compute_forward_add1(
                     ggml_compute_forward_add1_bf16_f32(params, dst);
                 }
                 else {
-                    GGML_ASSERT(false);
+                    GGML_ABORT("fatal error");
                 }
             } break;
         case GGML_TYPE_Q4_0:
@@ -9870,8 +9897,8 @@ static void ggml_compute_forward_add1(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -9995,8 +10022,8 @@ static void ggml_compute_forward_acc(
         case GGML_TYPE_Q4_0_8_8:
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -10076,8 +10103,8 @@ static void ggml_compute_forward_sub(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -10170,8 +10197,8 @@ static void ggml_compute_forward_mul(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -10261,8 +10288,8 @@ static void ggml_compute_forward_div(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -10306,8 +10333,8 @@ static void ggml_compute_forward_sqr(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -10351,8 +10378,8 @@ static void ggml_compute_forward_sqrt(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -10396,8 +10423,8 @@ static void ggml_compute_forward_log(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -10525,8 +10552,8 @@ static void ggml_compute_forward_sum(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -10578,8 +10605,8 @@ static void ggml_compute_forward_sum_rows(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -10635,8 +10662,8 @@ static void ggml_compute_forward_mean(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -10683,8 +10710,8 @@ static void ggml_compute_forward_argmax(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -10801,8 +10828,8 @@ static void ggml_compute_forward_repeat(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -10879,8 +10906,8 @@ static void ggml_compute_forward_repeat_back(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -10948,8 +10975,8 @@ static void ggml_compute_forward_concat(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -10992,8 +11019,8 @@ static void ggml_compute_forward_abs(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -11036,8 +11063,8 @@ static void ggml_compute_forward_sgn(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -11080,8 +11107,8 @@ static void ggml_compute_forward_neg(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -11124,8 +11151,8 @@ static void ggml_compute_forward_step(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -11168,8 +11195,8 @@ static void ggml_compute_forward_tanh(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -11212,8 +11239,8 @@ static void ggml_compute_forward_elu(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -11256,8 +11283,8 @@ static void ggml_compute_forward_relu(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -11300,8 +11327,8 @@ static void ggml_compute_forward_sigmoid(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -11359,8 +11386,8 @@ static void ggml_compute_forward_gelu(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -11418,8 +11445,8 @@ static void ggml_compute_forward_gelu_quick(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -11477,8 +11504,8 @@ static void ggml_compute_forward_silu(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 // ggml_compute_forward_leaky_relu
@@ -11526,8 +11553,8 @@ static void ggml_compute_forward_leaky_relu(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -11589,8 +11616,8 @@ static void ggml_compute_forward_silu_back(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -11631,8 +11658,8 @@ static void ggml_compute_forward_hardswish(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -11673,8 +11700,8 @@ static void ggml_compute_forward_hardsigmoid(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -11745,8 +11772,8 @@ static void ggml_compute_forward_norm(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -11813,8 +11840,8 @@ static void ggml_compute_forward_rms_norm(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -11986,8 +12013,8 @@ static void ggml_compute_forward_rms_norm_back(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -12080,8 +12107,8 @@ static void ggml_compute_forward_group_norm(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -12839,17 +12866,17 @@ static void ggml_compute_forward_out_prod(
             } break;
         case GGML_TYPE_F16:
             {
-                GGML_ASSERT(false); // todo
+                GGML_ABORT("fatal error"); // todo
                 // ggml_compute_forward_out_prod_f16_f32(params, dst);
-            } break;
+            }
         case GGML_TYPE_F32:
             {
                 ggml_compute_forward_out_prod_f32(params, dst);
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -12908,8 +12935,8 @@ static void ggml_compute_forward_scale(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -13024,8 +13051,8 @@ static void ggml_compute_forward_set(
         case GGML_TYPE_Q4_0_8_8:
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -13302,8 +13329,8 @@ static void ggml_compute_forward_get_rows(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 
     //static bool first = true;
@@ -13410,8 +13437,8 @@ static void ggml_compute_forward_get_rows_back(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 
     //static bool first = true;
@@ -13488,8 +13515,8 @@ static void ggml_compute_forward_diag(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -13558,8 +13585,8 @@ static void ggml_compute_forward_diag_mask_inf(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -13576,8 +13603,8 @@ static void ggml_compute_forward_diag_mask_zero(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -13694,8 +13721,8 @@ static void ggml_compute_forward_soft_max(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -13790,8 +13817,8 @@ static void ggml_compute_forward_soft_max_back(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -13881,8 +13908,8 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_F64:
         case GGML_TYPE_COUNT:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -14211,8 +14238,8 @@ static void ggml_compute_forward_rope(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -14235,8 +14262,8 @@ static void ggml_compute_forward_rope_back(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -14435,8 +14462,8 @@ static void ggml_compute_forward_conv_transpose_1d(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -14607,8 +14634,8 @@ static void ggml_compute_forward_im2col(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -14740,20 +14767,20 @@ static void ggml_compute_forward_pool_1d_sk_p0(
             switch (op) {
                 case GGML_OP_POOL_AVG:   drow[i] = 0;        break;
                 case GGML_OP_POOL_MAX:   drow[i] = -FLT_MAX; break;
-                case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
+                case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
             }
             for (int ki = 0; ki < k; ++ki) {
                 switch (op) {
                     case GGML_OP_POOL_AVG:                          drow[i] += srow[j]; break;
                     case GGML_OP_POOL_MAX:   if (srow[j] > drow[i]) drow[i]  = srow[j]; break;
-                    case GGML_OP_POOL_COUNT:                        GGML_ASSERT(false); break;
+                    case GGML_OP_POOL_COUNT:                        GGML_ABORT("fatal error");
                 }
                 ++j;
             }
             switch (op) {
                 case GGML_OP_POOL_AVG:         drow[i] /= k; break;
                 case GGML_OP_POOL_MAX:                       break;
-                case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
+                case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
             }
         }
 
@@ -14822,7 +14849,7 @@ static void ggml_compute_forward_pool_2d(
                 switch (op) {
                     case GGML_OP_POOL_AVG:     *out = 0;        break;
                     case GGML_OP_POOL_MAX:     *out = -FLT_MAX; break;
-                    case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
+                    case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
                 }
 
                 const int ix = offset0 + ox * s0;
@@ -14837,14 +14864,14 @@ static void ggml_compute_forward_pool_2d(
                         switch (op) {
                             case GGML_OP_POOL_AVG:                     *out += srow[j]; break;
                             case GGML_OP_POOL_MAX: if (srow[j] > *out) *out  = srow[j]; break;
-                            case GGML_OP_POOL_COUNT:                GGML_ASSERT(false); break;
+                            case GGML_OP_POOL_COUNT:                GGML_ABORT("fatal error");
                         }
                     }
                 }
                 switch (op) {
                     case GGML_OP_POOL_AVG:           *out /= ka; break;
                     case GGML_OP_POOL_MAX:                       break;
-                    case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
+                    case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
                 }
             }
         }
@@ -14908,8 +14935,8 @@ static void ggml_compute_forward_upscale(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -14966,8 +14993,8 @@ static void ggml_compute_forward_pad(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -15007,8 +15034,8 @@ static void ggml_compute_forward_arange(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -15058,8 +15085,8 @@ static void ggml_compute_forward_timestep_embedding(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -15117,8 +15144,8 @@ static void ggml_compute_forward_argsort(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -15340,8 +15367,8 @@ static void ggml_compute_forward_flash_attn_ext(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -15676,8 +15703,8 @@ static void ggml_compute_forward_flash_attn_back(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -15798,8 +15825,8 @@ static void ggml_compute_forward_ssm_conv(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -15919,8 +15946,8 @@ static void ggml_compute_forward_ssm_scan(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -15982,8 +16009,8 @@ static void ggml_compute_forward_win_part(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -16043,8 +16070,8 @@ static void ggml_compute_forward_win_unpart(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -16111,8 +16138,8 @@ static void ggml_compute_forward_unary(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -16158,8 +16185,8 @@ static void ggml_compute_forward_get_rel_pos(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -16239,8 +16266,8 @@ static void ggml_compute_forward_add_rel_pos(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -16285,8 +16312,8 @@ static void ggml_compute_forward_map_unary(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -16334,8 +16361,8 @@ static void ggml_compute_forward_map_binary(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -16533,8 +16560,8 @@ static void ggml_compute_forward_cross_entropy_loss(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -16620,8 +16647,8 @@ static void ggml_compute_forward_cross_entropy_loss_back(
             } break;
         default:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
@@ -16956,14 +16983,32 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             } break;
         case GGML_OP_COUNT:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 
-static size_t ggml_hash_size(size_t min_sz) {
+struct ggml_hash_set ggml_hash_set_new(size_t size) {
+    size = ggml_hash_size(size);
+    struct ggml_hash_set result;
+    result.size = size;
+    result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size);
+    result.used = GGML_CALLOC(ggml_bitset_size(size), sizeof(ggml_bitset_t));
+    return result;
+}
+
+void ggml_hash_set_reset(struct ggml_hash_set * hash_set) {
+    memset(hash_set->used, 0, sizeof(ggml_bitset_t) * ggml_bitset_size(hash_set->size));
+}
+
+void ggml_hash_set_free(struct ggml_hash_set * hash_set) {
+    GGML_FREE(hash_set->used);
+    GGML_FREE(hash_set->keys);
+}
+
+size_t ggml_hash_size(size_t min_sz) {
     // next primes after powers of two
     static const size_t primes[] = {
         2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031,
@@ -16974,7 +17019,7 @@ static size_t ggml_hash_size(size_t min_sz) {
     };
     static const size_t n_primes = sizeof(primes)/sizeof(primes[0]);
 
-    // find the smallest prime that is larger or equal to min_sz
+    // find the smallest prime that is greater than or equal to min_sz
     size_t l = 0;
     size_t r = n_primes;
     while (l < r) {
@@ -16989,67 +17034,6 @@ static size_t ggml_hash_size(size_t min_sz) {
     return sz;
 }
 
-static size_t ggml_hash(const void * p) {
-    return (size_t)p;
-}
-
-size_t ggml_hash_find(const struct ggml_hash_set hash_set, struct ggml_tensor * key) {
-    size_t h = ggml_hash(key) % hash_set.size;
-
-    // linear probing
-    size_t i = h;
-    while (hash_set.keys[i] != NULL && hash_set.keys[i] != key) {
-        i = (i + 1) % hash_set.size;
-        if (i == h) {
-            // visited all hash table entries -> not found
-            return GGML_HASHTABLE_FULL;
-        }
-    }
-    return i;
-}
-
-bool ggml_hash_contains(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
-    size_t i = ggml_hash_find(hash_set, key);
-    return i != GGML_HASHTABLE_FULL && hash_set.keys[i] == key;
-}
-
-size_t ggml_hash_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
-    size_t i = ggml_hash_find(hash_set, key);
-
-    GGML_ASSERT(i != GGML_HASHTABLE_FULL);
-
-    if (hash_set.keys[i] == key) {
-        return GGML_HASHTABLE_ALREADY_EXISTS;
-    }
-
-    // insert
-    GGML_ASSERT(hash_set.keys[i] == NULL);
-    hash_set.keys[i] = key;
-    return i;
-}
-
-size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
-    size_t i = ggml_hash_find(hash_set, key);
-
-    GGML_ASSERT(i != GGML_HASHTABLE_FULL);
-
-    hash_set.keys[i] = key;
-    return i;
-}
-
-struct ggml_hash_set ggml_hash_set_new(size_t size) {
-    size = ggml_hash_size(size);
-    struct ggml_hash_set result;
-    result.size = size;
-    result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size);
-    memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
-    return result;
-}
-
-static void ggml_hash_set_free(struct ggml_hash_set hash_set) {
-    GGML_FREE(hash_set.keys);
-}
-
 struct hash_map {
     struct ggml_hash_set set;
     struct ggml_tensor ** vals;
@@ -17058,13 +17042,12 @@ struct hash_map {
 static struct hash_map * ggml_new_hash_map(size_t size) {
     struct hash_map * result = GGML_MALLOC(sizeof(struct hash_map));
     result->set = ggml_hash_set_new(size);
-    result->vals = GGML_MALLOC(sizeof(struct ggml_tensor *) * result->set.size);
-    memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
+    result->vals = GGML_CALLOC(result->set.size, sizeof(struct ggml_tensor *));
     return result;
 }
 
 static void ggml_hash_map_free(struct hash_map * map) {
-    ggml_hash_set_free(map->set);
+    ggml_hash_set_free(&map->set);
     GGML_FREE(map->vals);
     GGML_FREE(map);
 }
@@ -17085,7 +17068,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
         return node;
     }
 
-    if (!ggml_hash_contains(graph->visited_hash_table, node)) {
+    if (!ggml_hash_contains(&graph->visited_hash_set, node)) {
         return node;
     }
 
@@ -17100,8 +17083,8 @@ static struct ggml_tensor * ggml_recompute_graph_node(
         return node;
     }
 
-    size_t i = ggml_hash_find(replacements->set, node);
-    GGML_ASSERT(i != GGML_HASHTABLE_FULL); // assert that not full
+    size_t i = ggml_hash_find(&replacements->set, node);
+    GGML_ASSERT(i != GGML_HASHSET_FULL); // assert that not full
     if (replacements->set.keys[i] == node) {
         return replacements->vals[i];
     }
@@ -17159,8 +17142,8 @@ void ggml_build_backward_gradient_checkpointing(
 
     // insert checkpoints in replacements
     for (int i = 0; i < n_checkpoints; ++i) {
-        size_t k = ggml_hash_find(replacements->set, checkpoints[i]);
-        GGML_ASSERT(k != GGML_HASHTABLE_FULL); // assert that not full
+        size_t k = ggml_hash_find(&replacements->set, checkpoints[i]);
+        GGML_ASSERT(k != GGML_HASHSET_FULL); // assert that not full
         GGML_ASSERT(replacements->set.keys[k] == NULL); // assert that we don't overwrite
         replacements->set.keys[k] = checkpoints[i];
         replacements->vals[k]     = checkpoints[i];
@@ -17188,7 +17171,7 @@ void ggml_build_backward_gradient_checkpointing(
 
 // functions to change gradients considering the case that input a might be initial gradient with zero value
 
-static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
+static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) {
     if (ggml_hash_contains(zero_table, a)) {
         return b;
     } else {
@@ -17196,7 +17179,7 @@ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct gg
     }
 }
 
-static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
+static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set * zero_table) {
     if (ggml_hash_contains(zero_table, a)) {
         struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f);
         return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
@@ -17205,7 +17188,7 @@ static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct gg
     }
 }
 
-static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
+static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) {
     if (ggml_hash_contains(zero_table, a)) {
         return ggml_repeat(ctx, b, a);
     } else {
@@ -17213,7 +17196,7 @@ static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct g
     }
 }
 
-static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
+static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) {
     if (ggml_hash_contains(zero_table, a)) {
         return ggml_neg(ctx, b);
     } else {
@@ -17221,7 +17204,7 @@ static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct gg
     }
 }
 
-static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
+static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set * zero_table) {
     struct ggml_tensor * src0 = tensor->src[0];
     struct ggml_tensor * src1 = tensor->src[1];
     struct ggml_tensor * src2 = tensor->src[2];
@@ -17390,8 +17373,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
         case GGML_OP_MEAN:
         case GGML_OP_ARGMAX:
             {
-                GGML_ASSERT(false); // TODO: implement
-            } break;
+                GGML_ABORT("fatal error"); // TODO: implement
+            }
         case GGML_OP_REPEAT:
             {
                 // necessary for llama
@@ -17414,16 +17397,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             } break;
         case GGML_OP_CONCAT:
             {
-                GGML_ASSERT(false); // TODO: implement
-            } break;
+                GGML_ABORT("fatal error"); // TODO: implement
+            }
         case GGML_OP_SILU_BACK:
             {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
+                GGML_ABORT("fatal error"); // TODO: not implemented
+            }
         case GGML_OP_NORM:
             {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
+                GGML_ABORT("fatal error"); // TODO: not implemented
+            }
         case GGML_OP_RMS_NORM:
             {
                 // necessary for llama
@@ -17439,12 +17422,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             } break;
         case GGML_OP_RMS_NORM_BACK:
             {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
+                GGML_ABORT("fatal error"); // TODO: not implemented
+            }
         case GGML_OP_GROUP_NORM:
             {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
+                GGML_ABORT("fatal error"); // TODO: not implemented
+            }
         case GGML_OP_MUL_MAT:
             {
                 // https://cs231n.github.io/optimization-2/#staged
@@ -17505,12 +17488,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
+                GGML_ABORT("fatal error"); // TODO: not implemented
+            }
         case GGML_OP_OUT_PROD:
             {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
+                GGML_ABORT("fatal error"); // TODO: not implemented
+            }
         case GGML_OP_SCALE:
             {
                 // necessary for llama
@@ -17686,12 +17669,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             } break;
         case GGML_OP_GET_ROWS_BACK:
             {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
+                GGML_ABORT("fatal error"); // TODO: not implemented
+            }
         case GGML_OP_DIAG:
             {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
+                GGML_ABORT("fatal error"); // TODO: not implemented
+            }
         case GGML_OP_DIAG_MASK_INF:
             {
                 // necessary for llama
@@ -17729,8 +17712,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             } break;
         case GGML_OP_SOFT_MAX_BACK:
             {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
+                GGML_ABORT("fatal error"); // TODO: not implemented
+            }
         case GGML_OP_ROPE:
             {
                 // necessary for llama
@@ -17805,52 +17788,52 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             } break;
         case GGML_OP_CLAMP:
             {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
+                GGML_ABORT("fatal error"); // TODO: not implemented
+            }
         case GGML_OP_CONV_TRANSPOSE_1D:
             {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
+                GGML_ABORT("fatal error"); // TODO: not implemented
+            }
         case GGML_OP_IM2COL:
             {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
+                GGML_ABORT("fatal error"); // TODO: not implemented
+            }
         case GGML_OP_CONV_TRANSPOSE_2D:
             {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
+                GGML_ABORT("fatal error"); // TODO: not implemented
+            }
         case GGML_OP_POOL_1D:
             {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
+                GGML_ABORT("fatal error"); // TODO: not implemented
+            }
         case GGML_OP_POOL_2D:
             {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
+                GGML_ABORT("fatal error"); // TODO: not implemented
+            }
         case GGML_OP_UPSCALE:
             {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
+                GGML_ABORT("fatal error"); // TODO: not implemented
+            }
         case GGML_OP_PAD:
             {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
+                GGML_ABORT("fatal error"); // TODO: not implemented
+            }
         case GGML_OP_ARANGE:
             {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
+                GGML_ABORT("fatal error"); // TODO: not implemented
+            }
         case GGML_OP_TIMESTEP_EMBEDDING:
             {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
+                GGML_ABORT("fatal error"); // TODO: not implemented
+            }
         case GGML_OP_ARGSORT:
             {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
+                GGML_ABORT("fatal error"); // TODO: not implemented
+            }
         case GGML_OP_LEAKY_RELU:
             {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
+                GGML_ABORT("fatal error"); // TODO: not implemented
+            }
         case GGML_OP_FLASH_ATTN_EXT:
             {
                 struct ggml_tensor * flash_grad = NULL;
@@ -17906,13 +17889,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             } break;
         case GGML_OP_FLASH_ATTN_BACK:
             {
-                GGML_ASSERT(false); // not supported
-            } break;
+                GGML_ABORT("fatal error"); // not supported
+            }
         case GGML_OP_SSM_CONV:
         case GGML_OP_SSM_SCAN:
             {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
+                GGML_ABORT("fatal error"); // TODO: not implemented
+            }
         case GGML_OP_WIN_PART:
         case GGML_OP_WIN_UNPART:
         case GGML_OP_UNARY:
@@ -17950,12 +17933,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         } break;
                     case GGML_UNARY_OP_TANH:
                         {
-                            GGML_ASSERT(false); // TODO: not implemented
-                        } break;
+                            GGML_ABORT("fatal error"); // TODO: not implemented
+                        }
                     case GGML_UNARY_OP_ELU:
                         {
-                            GGML_ASSERT(false); // TODO: not implemented
-                        } break;
+                            GGML_ABORT("fatal error"); // TODO: not implemented
+                        }
                     case GGML_UNARY_OP_RELU:
                         {
                             if (src0->grad) {
@@ -17969,16 +17952,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         } break;
                     case GGML_UNARY_OP_SIGMOID:
                         {
-                            GGML_ASSERT(false); // TODO: not implemented
-                        } break;
+                            GGML_ABORT("fatal error"); // TODO: not implemented
+                        }
                     case GGML_UNARY_OP_GELU:
                         {
-                            GGML_ASSERT(false); // TODO: not implemented
-                        } break;
+                            GGML_ABORT("fatal error"); // TODO: not implemented
+                        }
                     case GGML_UNARY_OP_GELU_QUICK:
                         {
-                            GGML_ASSERT(false); // TODO: not implemented
-                        } break;
+                            GGML_ABORT("fatal error"); // TODO: not implemented
+                        }
                     case GGML_UNARY_OP_SILU:
                         {
                             // necessary for llama
@@ -17990,7 +17973,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                             }
                         } break;
                     default:
-                        GGML_ASSERT(false);
+                        GGML_ABORT("fatal error");
                 }
             } break;
         case GGML_OP_GET_REL_POS:
@@ -18004,8 +17987,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
         case GGML_OP_MAP_CUSTOM2:
         case GGML_OP_MAP_CUSTOM3:
             {
-                GGML_ASSERT(false); // not supported
-            } break;
+                GGML_ABORT("fatal error"); // not supported
+            }
         case GGML_OP_CROSS_ENTROPY_LOSS:
             {
                 if (src0->grad) {
@@ -18020,16 +18003,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             } break;
         case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
             {
-                GGML_ASSERT(false); // not supported
-            } break;
+                GGML_ABORT("fatal error"); // not supported
+            }
         case GGML_OP_NONE:
             {
                 // nop
             } break;
         case GGML_OP_COUNT:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 
     for (int i = 0; i < GGML_MAX_SRC; ++i) {
@@ -18049,7 +18032,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
     }
 
     // check if already visited
-    if (ggml_hash_insert(cgraph->visited_hash_table, node) == GGML_HASHTABLE_ALREADY_EXISTS) {
+    if (ggml_hash_insert(&cgraph->visited_hash_set, node) == GGML_HASHSET_ALREADY_EXISTS) {
         return;
     }
 
@@ -18131,7 +18114,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
     struct ggml_hash_set zero_table = ggml_hash_set_new(gf->size);
     for (int i = 0; i < gf->n_nodes; i++) {
         if (gf->grads[i]) {
-            ggml_hash_insert(zero_table, gf->grads[i]);
+            ggml_hash_insert(&zero_table, gf->grads[i]);
         }
     }
 
@@ -18141,7 +18124,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
         // inplace operations to add gradients are not created by ggml_compute_backward
         // use allocator to automatically make inplace operations
         if (node->grad) {
-            ggml_compute_backward(ctx, node, zero_table);
+            ggml_compute_backward(ctx, node, &zero_table);
         }
     }
 
@@ -18154,16 +18137,29 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
         }
     }
 
-    ggml_hash_set_free(zero_table);
+    ggml_hash_set_free(&zero_table);
+}
+
+static void * incr_ptr_aligned(void ** p, size_t size, size_t align) {
+    void * ptr = *p;
+    ptr = (void *) GGML_PAD((uintptr_t) ptr, align);
+    *p = (void *) ((char *) ptr + size);
+    return ptr;
 }
 
 static size_t ggml_graph_nbytes(size_t size, bool grads) {
-    size_t nbytes = sizeof(struct ggml_cgraph);
-    nbytes += size * sizeof(struct ggml_tensor *) * 2; // leafs + nodes
+    size_t hash_size = ggml_hash_size(size * 2);
+    void * p = 0;
+    incr_ptr_aligned(&p, sizeof(struct ggml_cgraph), 1);
+    incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // nodes
+    incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // leafs
+    incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // hash keys
     if (grads) {
-        nbytes += size * sizeof(struct ggml_tensor *); // grads
+        incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // grads
     }
-    nbytes += ggml_hash_size(size * 2) * sizeof(struct ggml_tensor *); // hash set
+    incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t));
+
+    size_t nbytes = (size_t) p;
     return nbytes;
 }
 
@@ -18180,19 +18176,19 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
     struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_GRAPH, obj_size);
     struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
 
-    struct ggml_tensor ** data_start = (struct ggml_tensor **) (cgraph + 1);
-
+    // the size of the hash table is doubled since it needs to hold both nodes and leafs
     size_t hash_size = ggml_hash_size(size * 2);
-    struct ggml_tensor ** nodes_ptr = data_start;
-    struct ggml_tensor ** leafs_ptr = nodes_ptr + size;
-    struct ggml_tensor ** hash_keys_ptr = leafs_ptr + size;
-    struct ggml_tensor ** grads_ptr = grads ? hash_keys_ptr + hash_size : NULL;
+
+    void * p = cgraph + 1;
+
+    struct ggml_tensor ** nodes_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
+    struct ggml_tensor ** leafs_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
+    struct ggml_tensor ** hash_keys_ptr = incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
+    struct ggml_tensor ** grads_ptr = grads ? incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
+    ggml_bitset_t * hash_used = incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t));
 
     // check that we allocated the correct amount of memory
-    assert(obj_size == (size_t) (
-        (grads ? (char *)(grads_ptr + size) : (char *)(hash_keys_ptr + hash_size)) - (char *)cgraph));
-
-    memset(hash_keys_ptr, 0, hash_size * sizeof(struct ggml_tensor *));
+    assert(obj_size == (size_t)((char *)p - (char *)cgraph));
 
     *cgraph = (struct ggml_cgraph) {
         /*.size         =*/ size,
@@ -18201,10 +18197,12 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
         /*.nodes        =*/ nodes_ptr,
         /*.grads        =*/ grads_ptr,
         /*.leafs        =*/ leafs_ptr,
-        /*.hash_table   =*/ { hash_size, hash_keys_ptr },
+        /*.hash_table   =*/ { hash_size, hash_used, hash_keys_ptr },
         /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
     };
 
+    ggml_hash_set_reset(&cgraph->visited_hash_set);
+
     return cgraph;
 }
 
@@ -18220,7 +18218,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
         /*.nodes        =*/ cgraph0->nodes + i0,
         /*.grads        =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
         /*.leafs        =*/ NULL,
-        /*.hash_table   =*/ { 0, NULL },
+        /*.hash_table   =*/ { 0, NULL, NULL },
         /*.order        =*/ cgraph0->order,
     };
 
@@ -18230,7 +18228,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
 void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
     GGML_ASSERT(dst->size >= src->n_leafs);
     GGML_ASSERT(dst->size >= src->n_nodes);
-    GGML_ASSERT(dst->visited_hash_table.size >= src->visited_hash_table.size);
+    GGML_ASSERT(dst->visited_hash_set.size >= src->visited_hash_set.size);
 
     dst->n_leafs = src->n_leafs;
     dst->n_nodes = src->n_nodes;
@@ -18251,9 +18249,9 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
         }
     }
 
-    for (size_t i = 0; i < src->visited_hash_table.size; ++i) {
-        if (src->visited_hash_table.keys[i]) {
-            ggml_hash_insert(dst->visited_hash_table, src->visited_hash_table.keys[i]);
+    for (size_t i = 0; i < src->visited_hash_set.size; ++i) {
+        if (src->visited_hash_set.keys[i]) {
+            ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]);
         }
     }
 }
@@ -18279,7 +18277,7 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
 void ggml_graph_clear(struct ggml_cgraph * cgraph) {
     cgraph->n_leafs = 0;
     cgraph->n_nodes = 0;
-    memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_tensor *));
+    ggml_hash_set_reset(&cgraph->visited_hash_set);
 }
 
 //
@@ -18471,7 +18469,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                         n_tasks = n_threads;
                     } break;
                 default:
-                    GGML_ASSERT(false);
+                    GGML_ABORT("fatal error");
             }
             break;
         case GGML_OP_SILU_BACK:
@@ -18598,8 +18596,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_COUNT:
             {
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
         default:
             {
                 fprintf(stderr, "%s: op not implemented: ", __func__);
@@ -18608,8 +18606,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 } else {
                     fprintf(stderr, "%d\n", node->op);
                 }
-                GGML_ASSERT(false);
-            } break;
+                GGML_ABORT("fatal error");
+            }
     }
 
     assert(n_tasks > 0);
@@ -18719,7 +18717,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                         cur += sizeof(float)*ne00*ne01*ne02;
                         cur += sizeof(float)*ne10*ne11;
                     } else {
-                        GGML_ASSERT(false);
+                        GGML_ABORT("fatal error");
                     }
                 } break;
             case GGML_OP_CONV_TRANSPOSE_2D:
@@ -18765,8 +18763,8 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                 } break;
             case GGML_OP_COUNT:
                 {
-                    GGML_ASSERT(false);
-                } break;
+                    GGML_ABORT("fatal error");
+                }
             default:
                 break;
         }
@@ -20000,9 +19998,9 @@ static enum ggml_opt_result linesearch_backtracking(
         (*step) *= width;
     }
 
-    GGML_ASSERT(false && "line search failed");
+    GGML_ABORT("line search failed");
 
-    return GGML_LINESEARCH_FAIL;
+    //return GGML_LINESEARCH_FAIL;
 }
 
 static enum ggml_opt_result ggml_opt_lbfgs(
@@ -20270,9 +20268,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         step[0] = 1.0;
     }
 
-    GGML_ASSERT(false && "lbfgs failed");
+    GGML_ABORT("lbfgs failed");
 
-    return GGML_OPT_RESULT_DID_NOT_CONVERGE;
+    //return GGML_OPT_RESULT_DID_NOT_CONVERGE;
 }
 
 struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
@@ -20967,10 +20965,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                     }
                                 } break;
                             case GGUF_TYPE_ARRAY:
-                            default: GGML_ASSERT(false && "invalid type"); break;
+                            default: GGML_ABORT("invalid type");
                         }
                     } break;
-                default: GGML_ASSERT(false && "invalid type");
+                default: GGML_ABORT("invalid type");
             }
 
             if (!ok) {
@@ -21551,12 +21549,12 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
                         gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
                         GGML_FREE((void *)data);
                     } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
-                        GGML_ASSERT(false && "nested arrays not supported");
+                        GGML_ABORT("nested arrays not supported");
                     } else {
                         gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
                     }
                 } break;
-            default: GGML_ASSERT(false && "invalid type"); break;
+            default: GGML_ABORT("invalid type");
         }
     }
 }
@@ -21565,7 +21563,7 @@ void gguf_add_tensor(
              struct gguf_context * ctx,
         const struct ggml_tensor * tensor) {
     if (gguf_find_tensor(ctx, tensor->name) != -1) {
-        GGML_ASSERT(false && "duplicated tensor name");
+        GGML_ABORT("duplicated tensor name");
     }
 
     const int idx = ctx->header.n_tensors;
@@ -21598,7 +21596,7 @@ void gguf_add_tensor(
 void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
     const int idx = gguf_find_tensor(ctx, name);
     if (idx < 0) {
-        GGML_ASSERT(false && "tensor not found");
+        GGML_ABORT("tensor not found");
     }
 
     ctx->infos[idx].type = type;
@@ -21607,7 +21605,7 @@ void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggm
 void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
     const int idx = gguf_find_tensor(ctx, name);
     if (idx < 0) {
-        GGML_ASSERT(false && "tensor not found");
+        GGML_ABORT("tensor not found");
     }
 
     ctx->infos[idx].data = data;
@@ -21736,10 +21734,10 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
                                 }
                             } break;
                         case GGUF_TYPE_ARRAY:
-                        default: GGML_ASSERT(false && "invalid type"); break;
+                        default: GGML_ABORT("invalid type");
                     }
                 } break;
-            default: GGML_ASSERT(false && "invalid type");
+            default: GGML_ABORT("invalid type");
         }
     }
 
@@ -21800,7 +21798,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
 void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
     FILE * file = ggml_fopen(fname, "wb");
     if (!file) {
-        GGML_ASSERT(false && "failed to open file for writing");
+        GGML_ABORT("failed to open file for writing");
     }
 
     struct gguf_buf buf = gguf_buf_init(16*1024);
diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
index bd9322e2f..b123d7331 100644
--- a/src/llama-grammar.cpp
+++ b/src/llama-grammar.cpp
@@ -221,7 +221,7 @@ static void llama_grammar_advance_stack(
             // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
             // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
             // those
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
     }
 }
 
@@ -517,7 +517,7 @@ void llama_grammar_accept_token_impl(struct llama_grammar * grammar, const struc
                 return;
             }
         }
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 
     const std::string & piece = vocab->cache_token_to_piece.at(token);
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index c482b3689..133094904 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -152,14 +152,14 @@ static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
             return strtol(buf.c_str(), NULL, 16);
         }
         case LLAMA_VOCAB_TYPE_BPE: {
-            GGML_ASSERT(false);
-            return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
+            GGML_ABORT("fatal error");
+            //return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
         }
         case LLAMA_VOCAB_TYPE_WPM: {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
     }
 }
 
@@ -1396,7 +1396,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
                 }
             } break;
         case LLAMA_VOCAB_TYPE_NONE:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
     }
 
     return output;
@@ -1422,7 +1422,7 @@ llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch) {
             return vocab.token_to_id.at(unicode_byte_to_utf8(ch));
         }
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
     }
 }
 
@@ -1606,7 +1606,7 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
                 break;
             }
             default:
-                GGML_ASSERT(false);
+                GGML_ABORT("fatal error");
         }
     }
 
diff --git a/src/llama.cpp b/src/llama.cpp
index 77f7d32f8..bc830c0ef 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2259,8 +2259,7 @@ struct llama_hparams {
             return n_head_arr[il];
         }
 
-        GGML_ASSERT(false);
-        return 0;
+        GGML_ABORT("fatal error");
     }
 
     uint32_t n_head_kv(uint32_t il = 0) const {
@@ -2268,8 +2267,7 @@ struct llama_hparams {
             return n_head_kv_arr[il];
         }
 
-        GGML_ASSERT(false);
-        return 0;
+        GGML_ABORT("fatal error");
     }
 
     uint32_t n_ff(uint32_t il = 0) const {
@@ -2277,8 +2275,7 @@ struct llama_hparams {
             return n_ff_arr[il];
         }
 
-        GGML_ASSERT(false);
-        return 0;
+        GGML_ABORT("fatal error");
     }
 
     uint32_t n_gqa(uint32_t il = 0) const {
@@ -8072,7 +8069,7 @@ static struct ggml_tensor * llm_build_moe_ffn(
                 cb(gate, "ffn_moe_gelu", il);
             } break;
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
     }
 
     ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
@@ -8635,8 +8632,8 @@ struct llm_build_context {
                 } break;
             default:
                 {
-                    GGML_ASSERT(false && "unknown pooling type");
-                } break;
+                    GGML_ABORT("unknown pooling type");
+                }
         }
 
         cb(cur, "result_embd_pooled", -1);
@@ -8891,7 +8888,7 @@ struct llm_build_context {
                         Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens);
                         break;
                     default:
-                        GGML_ASSERT(false);
+                        GGML_ABORT("fatal error");
                 }
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
@@ -11723,7 +11720,7 @@ struct llm_build_context {
                 switch (model.type) {
                     case e_model::MODEL_9B:  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));   break;
                     case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
-                    default: GGML_ASSERT(false);
+                    default: GGML_ABORT("fatal error");
                 };
                 cb(Qcur, "Qcur_scaled", il);
 
@@ -13888,7 +13885,7 @@ static struct ggml_cgraph * llama_build_graph(
                 result = llm.build_jais();
             } break;
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
     }
 
     // add on pooling layer
@@ -14687,8 +14684,8 @@ static int llama_decode_internal(
                     } break;
                 case LLAMA_POOLING_TYPE_UNSPECIFIED:
                     {
-                        GGML_ASSERT(false && "unknown pooling type");
-                    } break;
+                        GGML_ABORT("unknown pooling type");
+                    }
             }
         }
         n_outputs_prev += lctx.n_outputs;
@@ -15079,7 +15076,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
     // apply K-shift if needed
     if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
         if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA
-            GGML_ASSERT(false && "Deepseek2 does not support K-shift");
+            GGML_ABORT("Deepseek2 does not support K-shift");
         }
 
         {
@@ -15218,7 +15215,7 @@ static void llama_tensor_dequantize_internal(
         } else if (ggml_is_quantized(tensor->type)) {
             qtype.to_float(tensor->data, f32_output, nelements);
         } else {
-            GGML_ASSERT(false); // unreachable
+            GGML_ABORT("fatal error"); // unreachable
         }
         return;
     }
@@ -16904,8 +16901,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
 
         // all model arches should be listed explicitly here
         case LLM_ARCH_UNKNOWN:
-            GGML_ASSERT(false && "unknown architecture");
-            break;
+            GGML_ABORT("unknown architecture");
     }
 
     return LLAMA_ROPE_TYPE_NONE;
@@ -18469,7 +18465,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
 #ifndef NDEBUG
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
 #endif
         return nullptr;
     }
@@ -18514,7 +18510,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
 #ifndef NDEBUG
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
 #endif
         return nullptr;
     }
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 2c03c60d4..2fa59fd0a 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -94,7 +94,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
         // This is going to create some weird integers though.
         ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
     } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }
 
@@ -132,7 +132,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
                         tt.to_float(&buf[i], vq.data(), bs);
                         tv.insert(tv.end(), vq.begin(), vq.end());
                     } else {
-                        GGML_ASSERT(false);
+                        GGML_ABORT("fatal error");
                     }
                 }
             }
@@ -1435,7 +1435,7 @@ struct test_argsort : public test_case {
                     ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
                 }
             } else {
-                GGML_ASSERT(false);
+                GGML_ABORT("fatal error");
             }
         }
     }
@@ -2462,7 +2462,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         return true;
     }
 
-    GGML_ASSERT(false);
+    GGML_ABORT("fatal error");
     return false;
 }
 
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index 6374958fe..de858bd3b 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -166,12 +166,12 @@ static void test_sampler_queue(
     for (auto s : samplers_sequence) {
         switch (s){
             case 'k': llama_sample_top_k    (nullptr, &candidates_p, top_k, 1); break;
-            case 'f': GGML_ASSERT(false && "tail_free test not implemented");   break;
-            case 'y': GGML_ASSERT(false && "typical test not implemented");     break;
+            case 'f': GGML_ABORT("tail_free test not implemented");   break;
+            case 'y': GGML_ABORT("typical test not implemented");     break;
             case 'p': llama_sample_top_p    (nullptr, &candidates_p, top_p, 1); break;
             case 'm': llama_sample_min_p    (nullptr, &candidates_p, min_p, 1); break;
-            case 't': GGML_ASSERT(false && "temperature test not implemented"); break;
-            default : GGML_ASSERT(false && "Unknown sampler");                  break;
+            case 't': GGML_ABORT("temperature test not implemented"); break;
+            default : GGML_ABORT("Unknown sampler");                  break;
         }
 
         llama_sample_softmax(nullptr, &candidates_p); // make sure tokens are sorted for tests
@@ -222,7 +222,7 @@ static void test_sampler_queue(
             GGML_ASSERT(candidates_p.data[0].id == max_token_id);
             GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
         } else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
     }
 

From bfb4c74981f0a40d757b450b596a9fe4ca983d26 Mon Sep 17 00:00:00 2001
From: wangshuai09 <391746016@qq.com>
Date: Sat, 27 Jul 2024 16:36:44 +0800
Subject: [PATCH 21/35] cann: Fix Multi-NPU execution error (#8710)

* cann: fix multi-npu exec error

* cann: update comment for ggml_backend_cann_supports_buft
---
 ggml/src/ggml-cann.cpp | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp
index ad5feea05..461febcc0 100644
--- a/ggml/src/ggml-cann.cpp
+++ b/ggml/src/ggml-cann.cpp
@@ -1559,23 +1559,18 @@ GGML_CALL static bool ggml_backend_cann_cpy_tensor_async(
             return false;
         }
 
+        // need open both directions for memcpyasync between devices.
+        ggml_cann_set_device(cann_ctx_dst->device);
+        ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
         ggml_cann_set_device(cann_ctx_src->device);
         ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
+
         ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
                                    ACL_MEMCPY_DEVICE_TO_DEVICE,
-                                   cann_ctx_dst->stream()));
+                                   cann_ctx_src->stream()));
 
-        // record event on src stream
-        if (!cann_ctx_src->copy_event) {
-            ACL_CHECK(aclrtCreateEvent(&cann_ctx_src->copy_event));
-        }
-
-        ACL_CHECK(
-            aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream()));
-
-        // wait on dst stream for the copy to complete
-        ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(),
-                                       cann_ctx_src->copy_event));
+        //TODO: workaround for Event didn`t work here.
+        aclrtSynchronizeStream(cann_ctx_src->stream());
     } else {
         // src and dst are on the same backend
         ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
@@ -1763,8 +1758,8 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
  *
  * This function determines whether the CANN backend supports the given backend
  * buffer type by comparing the device context of the backend and buffer type.
- * It returns true if the device associated with the buffer type matches the
- * device associated with the backend.
+ * It returns true if the devices are same between the backend context and
+ * buffer type context.
  *
  * @param backend Pointer to the CANN backend.
  * @param buft Pointer to the backend buffer type to check.
@@ -1773,9 +1768,14 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
  */
 GGML_CALL static bool ggml_backend_cann_supports_buft(
     ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
-    return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
-
-    GGML_UNUSED(backend);
+    if (ggml_backend_buft_is_cann(buft)) {
+        ggml_backend_cann_context * cann_ctx =
+                        (ggml_backend_cann_context *)backend->context;
+        ggml_backend_cann_buffer_type_context * buft_ctx =
+                        (ggml_backend_cann_buffer_type_context *)buft->context;
+        return buft_ctx->device == cann_ctx->device;
+    }
+    return false;
 }
 
 /**

From 9d03d085dd6cb275c078690bb64073b9b043e95f Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Sat, 27 Jul 2024 12:45:02 +0200
Subject: [PATCH 22/35] common : add --no-warmup option for main/llama-cli
 (#8712)

This commit adds a --no-warmup option for llama-cli.

The motivation for this is that it can be convenient to skip the
warmup llama_decode call when debugging.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
---
 common/common.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/common/common.cpp b/common/common.cpp
index ec44a0552..60c7eac75 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1324,6 +1324,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         else { invalid_param = true; }
         return true;
     }
+    if (arg == "--no-warmup") {
+        params.warmup = false;
+        return true;
+    }
 #ifndef LOG_DISABLE_LOGS
     // Parse args for logging parameters
     if (log_param_single_parse(argv[i])) {
@@ -1446,6 +1450,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "main infill", "       --in-prefix-bos",        "prefix BOS to user inputs, preceding the `--in-prefix` string" });
     options.push_back({ "main infill", "       --in-prefix STRING",     "string to prefix user inputs with (default: empty)" });
     options.push_back({ "main infill", "       --in-suffix STRING",     "string to suffix after user inputs with (default: empty)" });
+    options.push_back({ "main",        "       --no-warmup",            "skip warming up the model with an empty run" });
     options.push_back({ "server infill",
                                        "       --spm-infill",           "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });
 

From 92090eca212650727e38b335c1d4accfbcc9b79c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 27 Jul 2024 14:59:29 +0300
Subject: [PATCH 23/35] llama : add function for model-based max number of
 graph nodes (#8622)

* llama : model-based max number of graph nodes

ggml-ci

* llama : disable 405B max_nodes path due to lack of complaints

ggml-ci
---
 src/llama.cpp | 96 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 53 insertions(+), 43 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index bc830c0ef..c9cdbb343 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -101,7 +101,6 @@
 #endif
 
 // bump if necessary
-#define LLAMA_MAX_NODES   8192
 #define LLAMA_MAX_LAYERS  512
 #define LLAMA_MAX_EXPERTS 160  // DeepSeekV2
 
@@ -3567,6 +3566,15 @@ namespace GGUFMeta {
 
 using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
 
+// TODO: update when needed or think of some clever automatic way to do this
+static size_t llama_model_max_nodes(const llama_model & /*model*/) {
+    //if (model.arch == LLM_ARCH_LLAMA && model.hparams.n_layer > ??) { // llama-3 405B
+    //    return 32768;
+    //}
+
+    return 8192;
+}
+
 struct llama_model_loader {
     int n_kv      = 0;
     int n_tensors = 0;
@@ -8396,7 +8404,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_k_shift() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         GGML_ASSERT(kv_self.size == n_ctx);
 
@@ -8427,7 +8435,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_s_copy() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         GGML_ASSERT(kv_self.recurrent);
 
@@ -8450,7 +8458,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         for (uint32_t i = 0; i < ids.size(); ++i) {
             const uint32_t id = ids[i];
@@ -8691,7 +8699,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_llama() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         // mutable variable, needed during the last layer of the computation to skip unused tokens
         int32_t n_tokens = this->n_tokens;
@@ -8834,7 +8842,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_baichuan() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -8949,7 +8957,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_xverse() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9052,7 +9060,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_falcon() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
@@ -9172,7 +9180,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_grok() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         // mutable variable, needed during the last layer of the computation to skip unused tokens
         int32_t n_tokens = this->n_tokens;
@@ -9329,7 +9337,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_dbrx() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         // mutable variable, needed during the last layer of the computation to skip unused tokens
         int32_t n_tokens = this->n_tokens;
@@ -9455,7 +9463,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_starcoder() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
@@ -9559,7 +9567,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_refact() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -9653,7 +9661,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_bert() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
@@ -9847,7 +9855,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_bloom() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
@@ -9948,7 +9956,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_mpt() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
@@ -10238,7 +10246,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_qwen() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -10350,7 +10358,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_qwen2() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -10462,7 +10470,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_qwen2moe() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         // mutable variable, needed during the last layer of the computation to skip unused tokens
         int32_t n_tokens = this->n_tokens;
@@ -10608,7 +10616,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_phi2() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
@@ -10729,7 +10737,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_phi3() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@@ -10961,7 +10969,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_gpt2() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
@@ -11066,7 +11074,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_codeshell() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
@@ -11177,7 +11185,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_orion() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -11295,7 +11303,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_internlm2() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -11416,7 +11424,7 @@ struct llm_build_context {
     //      https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
     // based on the original build_llama() function
     struct ggml_cgraph * build_minicpm() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -11560,7 +11568,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_gemma() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head_k = hparams.n_embd_head_k;
 
@@ -11668,7 +11676,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_gemma2() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head_k = hparams.n_embd_head_k;
 
@@ -11803,7 +11811,7 @@ struct llm_build_context {
 
 
     struct ggml_cgraph * build_starcoder2() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -11922,7 +11930,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_mamba() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t d_model = n_embd;
         const int64_t d_conv  = hparams.ssm_d_conv;
@@ -12071,7 +12079,7 @@ struct llm_build_context {
 
     struct ggml_cgraph * build_command_r() {
 
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -12225,7 +12233,7 @@ struct llm_build_context {
     //   * removed bias
     //   * removed MoE
     struct ggml_cgraph * build_olmo() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         // mutable variable, needed during the last layer of the computation to skip unused tokens
         int32_t n_tokens = this->n_tokens;
@@ -12349,7 +12357,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_openelm() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -12474,7 +12482,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_gptneox() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
@@ -12616,7 +12624,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_arctic() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         // mutable variable, needed during the last layer of the computation to skip unused tokens
         int32_t n_tokens = this->n_tokens;
@@ -12748,7 +12756,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_deepseek2() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         // mutable variable, needed during the last layer of the computation to skip unused tokens
         int32_t n_tokens = this->n_tokens;
@@ -12976,7 +12984,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_bitnet() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -13116,7 +13124,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_t5() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         // mutable variable, needed during the last layer of the computation to skip unused tokens
         int32_t n_tokens = this->n_tokens;
@@ -13433,7 +13441,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_jais() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
@@ -13525,7 +13533,7 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * build_chatglm() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
@@ -14870,9 +14878,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     // each move requires 6*n_layer tensors (see build_defrag)
     //   - source view, destination view, copy operation
     //   - x2 for keys and values
-    //const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
+    //const uint32_t max_moves = llama_model_max_nodes(model)/(6*n_layer);
     // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
-    const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer);
+    const uint32_t max_moves = (llama_model_max_nodes(lctx.model) - 2*n_layer)/(6*n_layer);
 
     // determine which KV cells to move where
     //
@@ -16762,8 +16770,10 @@ struct llama_context * llama_new_context_with_model(
                 }
             }
 
+            const size_t max_nodes = llama_model_max_nodes(*model);
+
             // buffer used to store the computation graph and the tensor meta data
-            ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
+            ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
 
             // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
             bool pipeline_parallel =
@@ -16776,7 +16786,7 @@ struct llama_context * llama_new_context_with_model(
             // currently this is only implemented in the CUDA backend
             pipeline_parallel = false;
 #endif
-            ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel);
+            ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel);
 
             if (pipeline_parallel) {
                 LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched));

From b5e95468b1676e1e5c9d80d1eeeb26f542a38f42 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan <jmorganca@gmail.com>
Date: Sat, 27 Jul 2024 05:03:45 -0700
Subject: [PATCH 24/35] llama : add support for llama 3.1 rope scaling factors
 (#8676)

* Add llama 3.1 rope scaling factors to llama conversion and inference

This commit generates the rope factors on conversion and adds them to the resulting model as a tensor. At inference time, these factors are passed to the `ggml_rope_ext` rope operation, improving results for context windows above 8192.

* Update convert_hf_to_gguf.py

Co-authored-by: compilade <git@compilade.net>

* address comments

* address comments

* Update src/llama.cpp

Co-authored-by: compilade <git@compilade.net>

* Update convert_hf_to_gguf.py

Co-authored-by: compilade <git@compilade.net>

---------

Co-authored-by: compilade <git@compilade.net>
---
 convert_hf_to_gguf.py | 28 ++++++++++++++++++++++++++++
 src/llama.cpp         | 14 ++++++++++++--
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 4087187c1..8ba3c5844 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1570,6 +1570,34 @@ class LlamaModel(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
     def prepare_tensors(self):
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+
         super().prepare_tensors()
 
         if self._experts is not None:
diff --git a/src/llama.cpp b/src/llama.cpp
index c9cdbb343..0345d0062 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2451,6 +2451,7 @@ struct llama_layer {
     // long rope factors
     struct ggml_tensor * rope_long  = nullptr;
     struct ggml_tensor * rope_short = nullptr;
+    struct ggml_tensor * rope_freqs = nullptr;
 
     // bitnet scale
     struct ggml_tensor * wq_scale;
@@ -6059,6 +6060,8 @@ static bool llm_load_tensors(
 
                         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
 
+                        layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+
                         if (n_expert == 0) {
                             layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
                             layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
@@ -8536,6 +8539,10 @@ struct llm_build_context {
         // choose long/short freq factors based on the context size
         const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
 
+        if (model.layers[il].rope_freqs != nullptr) {
+            return model.layers[il].rope_freqs;
+        }
+
         if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
             return model.layers[il].rope_long;
         }
@@ -8730,6 +8737,9 @@ struct llm_build_context {
 
             // self-attention
             {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                struct ggml_tensor * rope_factors = build_rope_factors(il);
+
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
@@ -8753,14 +8763,14 @@ struct llm_build_context {
                 }
 
                 Qcur = ggml_rope_ext(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
                 Kcur = ggml_rope_ext(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );

From c12b6e8ee7d905e0f299caf311689189fb1b4ac5 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Mon, 8 Jul 2024 12:03:42 +0200
Subject: [PATCH 25/35] ggml : remove unnecessary UNUSED macro call (ggml/880)

This commit removes an UNUSED macro call that is not needed as the
variable n0 is used in the code and will not produce a warning.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
---
 ggml/src/ggml.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index c196fd5bf..a14d0d1db 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -18078,7 +18078,6 @@ static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_ten
     }
 
     const int n0 = cgraph->n_nodes;
-    UNUSED(n0);
 
     ggml_visit_parents(cgraph, tensor);
 

From d2b851bfa131478665315bc5c7c707506c14d703 Mon Sep 17 00:00:00 2001
From: Borislav Stanimirov <b.stanimirov@abv.bg>
Date: Fri, 12 Jul 2024 17:24:20 +0300
Subject: [PATCH 26/35] cmake : only enable GGML_NATIVE and x86 flags if not
 crosscompiling (ggml/885)

---
 ggml/CMakeLists.txt | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index be22a7460..1768a508b 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -50,9 +50,15 @@ else()
     set(GGML_BLAS_VENDOR_DEFAULT "Generic")
 endif()
 
+if (CMAKE_CROSSCOMPILING)
+    set(GGML_NATIVE_DEFAULT OFF)
+else()
+    set(GGML_NATIVE_DEFAULT ON)
+endif()
+
 # general
 option(GGML_STATIC "ggml: static link libraries"         OFF)
-option(GGML_NATIVE "ggml: enable -march=native flag"     ON)
+option(GGML_NATIVE "ggml: enable -march=native flag"     ${GGML_NATIVE_DEFAULT})
 option(GGML_LTO    "ggml: enable link time optimization" OFF)
 option(GGML_CCACHE "ggml: use ccache if available"       ON)
 
@@ -70,7 +76,7 @@ option(GGML_SANITIZE_ADDRESS   "ggml: enable address sanitizer"   OFF)
 option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)
 
 # instruction set specific
-if (GGML_NATIVE)
+if (GGML_NATIVE OR NOT GGML_NATIVE_DEFAULT)
     set(INS_ENB OFF)
 else()
     set(INS_ENB ON)

From 203b7f1531303a060730ec1d1e01920e70302398 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <4840017+neobrain@users.noreply.github.com>
Date: Sat, 20 Jul 2024 20:49:44 +0200
Subject: [PATCH 27/35] vulkan : initialize vk_buffer_struct members to
 VK_NULL_HANDLE (ggml/893)

This prevents invalid frees when destroying a partially initialized
vk_buffer_struct. For example, this could happen in ggml_vk_create_buffer
when running out of device memory.

Co-authored-by: Tony Wasserka <neobrain@users.noreply.github.com>
---
 ggml/src/ggml-vulkan.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
index 74991f6d1..fa68360b9 100644
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -236,8 +236,8 @@ struct vk_device_struct {
 };
 
 struct vk_buffer_struct {
-    vk::Buffer buffer;
-    vk::DeviceMemory device_memory;
+    vk::Buffer buffer = VK_NULL_HANDLE;
+    vk::DeviceMemory device_memory = VK_NULL_HANDLE;
     vk::MemoryPropertyFlags memory_property_flags;
     void * ptr;
     size_t size = 0;

From 9f77d899b7b0d56496f679e54b797da6199fed8e Mon Sep 17 00:00:00 2001
From: Ivan Filipov <159561759+vanaka11@users.noreply.github.com>
Date: Mon, 22 Jul 2024 14:32:02 +0300
Subject: [PATCH 28/35] ggml: add support for float16 input tensors in pooling
 operations (ggml/895)

* Add support for float16 tensors in 1d pooling operations

* Add support for float16 input tensors in 2d pooling operations

* code cleanup

remove unnecessary casting during srow ptr initialization

---------

Co-authored-by: vanaka11 <vanaka1189@gmail.com>
---
 ggml/src/ggml.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index a14d0d1db..c76d00a39 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -14746,7 +14746,7 @@ static void ggml_compute_forward_pool_1d_sk_p0(
 
     const struct ggml_tensor * src = dst->src[0];
 
-    assert(src->type == GGML_TYPE_F32);
+    assert(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16);
 
     if (params->ith != 0) {
         return;
@@ -14759,10 +14759,8 @@ static void ggml_compute_forward_pool_1d_sk_p0(
     const int64_t rs = dst->ne[0];
 
     while (cdata < data_end) {
-        const float * const srow = (const float *)cdata;
-
+        const void * srow = (const void *)cdata;
         int j = 0;
-
         for (int64_t i = 0; i < rs; ++i) {
             switch (op) {
                 case GGML_OP_POOL_AVG:   drow[i] = 0;        break;
@@ -14770,10 +14768,11 @@ static void ggml_compute_forward_pool_1d_sk_p0(
                 case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error");
             }
             for (int ki = 0; ki < k; ++ki) {
+                const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
                 switch (op) {
-                    case GGML_OP_POOL_AVG:                          drow[i] += srow[j]; break;
-                    case GGML_OP_POOL_MAX:   if (srow[j] > drow[i]) drow[i]  = srow[j]; break;
-                    case GGML_OP_POOL_COUNT:                        GGML_ABORT("fatal error");
+                    case GGML_OP_POOL_AVG:                         drow[i] += srow_j; break;
+                    case GGML_OP_POOL_MAX:   if (srow_j > drow[i]) drow[i]  = srow_j; break;
+                    case GGML_OP_POOL_COUNT:                       GGML_ABORT("fatal error");
                 }
                 ++j;
             }
@@ -14814,7 +14813,7 @@ static void ggml_compute_forward_pool_2d(
 
     const struct ggml_tensor * src = dst->src[0];
 
-    GGML_ASSERT(src->type == GGML_TYPE_F32);
+    assert(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16);
 
     if (params->ith != 0) {
         return;
@@ -14857,14 +14856,15 @@ static void ggml_compute_forward_pool_2d(
 
                 for (int ky = 0; ky < k1; ++ky) {
                     if (iy + ky < 0 || iy + ky >= src->ne[1]) continue;
-                    const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky));
+                    const void * srow = (const void *)(cdata + src->nb[1] * (iy + ky));
                     for (int kx = 0; kx < k0; ++kx) {
                         int j = ix + kx;
                         if (j < 0 || j >= src->ne[0]) continue;
+                        const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]);
                         switch (op) {
-                            case GGML_OP_POOL_AVG:                     *out += srow[j]; break;
-                            case GGML_OP_POOL_MAX: if (srow[j] > *out) *out  = srow[j]; break;
-                            case GGML_OP_POOL_COUNT:                GGML_ABORT("fatal error");
+                            case GGML_OP_POOL_AVG:                     *out += srow_j; break;
+                            case GGML_OP_POOL_MAX: if (srow_j > *out)  *out  = srow_j; break;
+                            case GGML_OP_POOL_COUNT:               GGML_ABORT("fatal error");
                         }
                     }
                 }

From a05ca9369716a8319014cd1fc365980d43f8aae9 Mon Sep 17 00:00:00 2001
From: Mahesh Madhav <67384846+heshpdx@users.noreply.github.com>
Date: Thu, 25 Jul 2024 00:54:08 -0700
Subject: [PATCH 29/35] ggml : loop tiling optimizations for scalar path
 (ggml/898)

Apply a loop tiling technique to the generic path, which provides
performance upside for ISAs with enough registers to take advantage
of it. Also helps the compiler optimize this path.
---
 ggml/src/ggml-quants.c | 32 ++++++++++++++++++++++----------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 1c6c85aac..aa936fe5f 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -4190,15 +4190,18 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
 #endif
     for (; ib < nb; ++ib) {
-        int sumi = 0;
+        int sumi0 = 0;
+        int sumi1 = 0;
 
         for (int j = 0; j < qk/2; ++j) {
             const int v0 = (x[ib].qs[j] & 0x0F) - 8;
             const int v1 = (x[ib].qs[j] >>   4) - 8;
 
-            sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]);
+            sumi0 += (v0 * y[ib].qs[j]);
+            sumi1 += (v1 * y[ib].qs[j + qk/2]);
         }
 
+        int sumi = sumi0 + sumi1;
         sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
     }
 
@@ -4474,15 +4477,18 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     sumf = hsum_float_8(acc) + summs;
 #endif
     for (; ib < nb; ++ib) {
-        int sumi = 0;
+        int sumi0 = 0
+        int sumi1 = 0;
 
         for (int j = 0; j < qk/2; ++j) {
             const int v0 = (x[ib].qs[j] & 0x0F);
             const int v1 = (x[ib].qs[j] >>   4);
 
-            sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]);
+            sumi0 += (v0 * y[ib].qs[j]);
+            sumi1 += (v1 * y[ib].qs[j + qk/2]);
         }
 
+        int sumi = sumi0 + sumi1;
         sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
     }
 
@@ -4823,18 +4829,21 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         uint32_t qh;
         memcpy(&qh, x[ib].qh, sizeof(qh));
 
-        int sumi = 0;
+        int sumi0 = 0;
+        int sumi1 = 0;
 
         for (int j = 0; j < qk/2; ++j) {
             const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
             const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
 
-            const int32_t x0 = ((x[ib].qs[j] & 0x0F) | xh_0) - 16;
-            const int32_t x1 = ((x[ib].qs[j] >>   4) | xh_1) - 16;
+            const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
+            const int32_t x1 = (int8_t)(((x[ib].qs[j] >>   4) | xh_1) - 16);
 
-            sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]);
+            sumi0 += (x0 * y[ib].qs[j]);
+            sumi1 += (x1 * y[ib].qs[j + qk/2]);
         }
 
+        int sumi = sumi0 + sumi1;
         sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
     }
 
@@ -5194,7 +5203,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
         uint32_t qh;
         memcpy(&qh, x[ib].qh, sizeof(qh));
 
-        int sumi = 0;
+        int sumi0 = 0;
+        int sumi1 = 0;
 
         for (int j = 0; j < qk/2; ++j) {
             const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
@@ -5203,9 +5213,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
             const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
             const int32_t x1 = (x[ib].qs[j] >>  4) | xh_1;
 
-            sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]);
+            sumi0 += (x0 * y[ib].qs[j]);
+            sumi1 += (x1 * y[ib].qs[j + qk/2]);
         }
 
+        int sumi = sumi0 + sumi1;
         sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
     }
 

From ae7985cd7beca3b849328d169a8d592469cd021f Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 27 Jul 2024 15:53:48 +0300
Subject: [PATCH 30/35] sync : ggml

ggml-ci
---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 80159b70b..998b23ac6 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-e3b3846976c94163f2b3dd128cc959782653edbb
+31d544f87835a55602883fe09156bb85a4c163d8

From 345c8c0c87a97c1595f9c8b14833d531c8c7d8df Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 27 Jul 2024 15:57:09 +0300
Subject: [PATCH 31/35] ggml : add missing semicolon (#0)

ggml-ci
---
 ggml/src/ggml-quants.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index aa936fe5f..9016314f5 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -4477,7 +4477,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     sumf = hsum_float_8(acc) + summs;
 #endif
     for (; ib < nb; ++ib) {
-        int sumi0 = 0
+        int sumi0 = 0;
         int sumi1 = 0;
 
         for (int j = 0; j < qk/2; ++j) {

From 56f20aa25d5f97248a204b473c99f4040900f0e5 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 27 Jul 2024 17:19:35 +0300
Subject: [PATCH 32/35] scripts : sync ggml-aarch64 sources

---
 scripts/sync-ggml-am.sh | 4 ++++
 scripts/sync-ggml.sh    | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh
index ba3bedf21..f624e4881 100755
--- a/scripts/sync-ggml-am.sh
+++ b/scripts/sync-ggml-am.sh
@@ -102,6 +102,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
     # cmake/FindSIMD.cmake    -> ggml/cmake/FindSIMD.cmake
     #
     # src/ggml.c              -> ggml/src/ggml.c
+    # src/ggml-aarch64.c      -> ggml/src/ggml-aarch64.c
+    # src/ggml-aarch64.h      -> ggml/src/ggml-aarch64.h
     # src/ggml-alloc.c        -> ggml/src/ggml-alloc.c
     # src/ggml-backend-impl.h -> ggml/src/ggml-backend-impl.h
     # src/ggml-backend.c      -> ggml/src/ggml-backend.c
@@ -143,6 +145,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
         -e 's/([[:space:]]|[ab]\/)src\/CMakeLists.txt/\1ggml\/src\/CMakeLists.txt/g' \
         -e 's/([[:space:]]|[ab]\/)cmake\/FindSIMD.cmake/\1ggml\/cmake\/FindSIMD.cmake/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml\.c/\1ggml\/src\/ggml.c/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.c/\1ggml\/src\/ggml-aarch64.c/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.h/\1ggml\/src\/ggml-aarch64.h/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-alloc\.c/\1ggml\/src\/ggml-alloc.c/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-backend-impl\.h/\1ggml\/src\/ggml-backend-impl.h/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-backend\.c/\1ggml\/src\/ggml-backend.c/g' \
diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh
index 402446ef9..af3784c3d 100755
--- a/scripts/sync-ggml.sh
+++ b/scripts/sync-ggml.sh
@@ -5,6 +5,8 @@ cp -rpv ../ggml/src/CMakeLists.txt   ./ggml/src/CMakeLists.txt
 cp -rpv ../ggml/cmake/FindSIMD.cmake ./ggml/cmake/FindSIMD.cmake
 
 cp -rpv ../ggml/src/ggml.c              ./ggml/src/ggml.c
+cp -rpv ../ggml/src/ggml-aarch64.c      ./ggml/src/ggml-aarch64.c
+cp -rpv ../ggml/src/ggml-aarch64.h      ./ggml/src/ggml-aarch64.h
 cp -rpv ../ggml/src/ggml-alloc.c        ./ggml/src/ggml-alloc.c
 cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml/src/ggml-backend-impl.h
 cp -rpv ../ggml/src/ggml-backend.c      ./ggml/src/ggml-backend.c

From 5e2727fe0321c38d1664d26173c654fa1801dc5f Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 27 Jul 2024 18:08:31 +0300
Subject: [PATCH 33/35] scripts : sync vulkan-shaders (#0)

---
 scripts/sync-ggml-am.sh | 2 ++
 scripts/sync-ggml.sh    | 1 +
 2 files changed, 3 insertions(+)

diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh
index f624e4881..c40025356 100755
--- a/scripts/sync-ggml-am.sh
+++ b/scripts/sync-ggml-am.sh
@@ -119,6 +119,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
     # src/ggml-sycl/*         -> ggml/src/ggml-sycl/
     # src/ggml-sycl.cpp       -> ggml/src/ggml-sycl.cpp
     # src/ggml-vulkan.cpp     -> ggml/src/ggml-vulkan.cpp
+    # src/vulkan-shaders/*    -> ggml/src/vulkan-shaders/
     #
     # include/ggml.h         -> ggml/include/ggml.h
     # include/ggml-alloc.h   -> ggml/include/ggml-alloc.h
@@ -162,6 +163,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
         -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\//\1ggml\/src\/ggml-sycl\//g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\.cpp/\1ggml\/src\/ggml-sycl.cpp/g' \
         -e 's/([[:space:]]|[ab]\/)src\/ggml-vulkan\.cpp/\1ggml\/src\/ggml-vulkan.cpp/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/vulkan-shaders\//\1ggml\/src\/vulkan-shaders\//g' \
         -e 's/([[:space:]]|[ab]\/)include\/ggml\.h/\1ggml\/include\/ggml.h/g' \
         -e 's/([[:space:]]|[ab]\/)include\/ggml-alloc\.h/\1ggml\/include\/ggml-alloc.h/g' \
         -e 's/([[:space:]]|[ab]\/)include\/ggml-backend\.h/\1ggml\/include\/ggml-backend.h/g' \
diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh
index af3784c3d..d6d7d0a60 100755
--- a/scripts/sync-ggml.sh
+++ b/scripts/sync-ggml.sh
@@ -23,6 +23,7 @@ cp -rpv ../ggml/src/ggml-rpc.cpp        ./ggml/src/ggml-rpc.cpp
 cp -rpv ../ggml/src/ggml-sycl/*         ./ggml/src/ggml-sycl/
 cp -rpv ../ggml/src/ggml-sycl.cpp       ./ggml/src/ggml-sycl.cpp
 cp -rpv ../ggml/src/ggml-vulkan.cpp     ./ggml/src/ggml-vulkan.cpp
+cp -rpv ../ggml/src/vulkan-shaders/*    ./ggml/src/vulkan-shaders/
 
 cp -rpv ../ggml/include/ggml.h         ./ggml/include/ggml.h
 cp -rpv ../ggml/include/ggml-alloc.h   ./ggml/include/ggml-alloc.h

From e54c35e4fb5777c76316a50671640e6e144c9538 Mon Sep 17 00:00:00 2001
From: R0CKSTAR <yeahdongcn@gmail.com>
Date: Sun, 28 Jul 2024 07:41:25 +0800
Subject: [PATCH 34/35] feat: Support Moore Threads GPU  (#8383)

* Update doc for MUSA

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>

* Add GGML_MUSA in Makefile

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>

* Add GGML_MUSA in CMake

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>

* CUDA => MUSA

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>

* MUSA adds support for __vsubss4

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>

* Fix CI build failure

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>

---------

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
---
 Makefile                      |  57 ++++++++--
 README.md                     |   1 +
 docs/build.md                 |  13 +++
 ggml/CMakeLists.txt           |   1 +
 ggml/include/ggml-cuda.h      |   3 +
 ggml/src/CMakeLists.txt       |  62 +++++++++--
 ggml/src/ggml-common.h        |   6 +-
 ggml/src/ggml-cuda.cu         |  22 ++--
 ggml/src/ggml-cuda/common.cuh | 194 +++++++++++++++++++++++++++++++++-
 9 files changed, 329 insertions(+), 30 deletions(-)

diff --git a/Makefile b/Makefile
index 7e015af3e..c82f4268a 100644
--- a/Makefile
+++ b/Makefile
@@ -528,10 +528,21 @@ ifndef GGML_NO_ACCELERATE
 	endif
 endif # GGML_NO_ACCELERATE
 
+ifdef GGML_MUSA
+	CC := clang
+	CXX := clang++
+	GGML_CUDA := 1
+	MK_CPPFLAGS += -DGGML_USE_MUSA
+endif
+
 ifndef GGML_NO_OPENMP
 	MK_CPPFLAGS += -DGGML_USE_OPENMP
 	MK_CFLAGS   += -fopenmp
 	MK_CXXFLAGS += -fopenmp
+	ifdef GGML_MUSA
+		MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
+		MK_LDFLAGS  += -L/usr/lib/llvm-10/lib
+	endif # GGML_MUSA
 endif # GGML_NO_OPENMP
 
 ifdef GGML_OPENBLAS
@@ -582,15 +593,27 @@ else
 endif # GGML_CUDA_FA_ALL_QUANTS
 
 ifdef GGML_CUDA
-	ifneq ('', '$(wildcard /opt/cuda)')
-		CUDA_PATH ?= /opt/cuda
-	else
-		CUDA_PATH ?= /usr/local/cuda
-	endif
+	ifdef GGML_MUSA
+		ifneq ('', '$(wildcard /opt/musa)')
+			CUDA_PATH ?= /opt/musa
+		else
+			CUDA_PATH ?= /usr/local/musa
+		endif
 
-	MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
-	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
-	MK_NVCCFLAGS += -use_fast_math
+		MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include
+		MK_LDFLAGS   += -lmusa -lmublas -lmusart -lpthread -ldl -lrt -L$(CUDA_PATH)/lib -L/usr/lib64
+		MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_22
+	else
+		ifneq ('', '$(wildcard /opt/cuda)')
+			CUDA_PATH ?= /opt/cuda
+		else
+			CUDA_PATH ?= /usr/local/cuda
+		endif
+
+		MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
+		MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
+		MK_NVCCFLAGS += -use_fast_math
+	endif # GGML_MUSA
 
 	OBJ_GGML += ggml/src/ggml-cuda.o
 	OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
@@ -600,9 +623,11 @@ ifdef LLAMA_FATAL_WARNINGS
 	MK_NVCCFLAGS += -Werror all-warnings
 endif # LLAMA_FATAL_WARNINGS
 
+ifndef GGML_MUSA
 ifndef JETSON_EOL_MODULE_DETECT
 	MK_NVCCFLAGS += --forward-unknown-to-host-compiler
 endif # JETSON_EOL_MODULE_DETECT
+endif # GGML_MUSA
 
 ifdef LLAMA_DEBUG
 	MK_NVCCFLAGS += -lineinfo
@@ -615,8 +640,12 @@ endif # GGML_CUDA_DEBUG
 ifdef GGML_CUDA_NVCC
 	NVCC = $(CCACHE) $(GGML_CUDA_NVCC)
 else
-	NVCC = $(CCACHE) nvcc
-endif #GGML_CUDA_NVCC
+	ifdef GGML_MUSA
+		NVCC = $(CCACHE) mcc
+	else
+		NVCC = $(CCACHE) nvcc
+	endif # GGML_MUSA
+endif # GGML_CUDA_NVCC
 
 ifdef CUDA_DOCKER_ARCH
 	MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
@@ -687,9 +716,15 @@ define NVCC_COMPILE
 	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endef # NVCC_COMPILE
 else
+	ifdef GGML_MUSA
+define NVCC_COMPILE
+	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -c $< -o $@
+endef # NVCC_COMPILE
+	else
 define NVCC_COMPILE
 	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endef # NVCC_COMPILE
+	endif # GGML_MUSA
 endif # JETSON_EOL_MODULE_DETECT
 
 ggml/src/ggml-cuda/%.o: \
@@ -944,6 +979,7 @@ $(info I CXX:       $(shell $(CXX)  --version | head -n 1))
 ifdef GGML_CUDA
 $(info I NVCC:      $(shell $(NVCC) --version | tail -n 1))
 CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
+ifndef GGML_MUSA
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
 
 ifndef CUDA_DOCKER_ARCH
@@ -953,6 +989,7 @@ endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
 
 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
+endif # GGML_MUSA
 endif # GGML_CUDA
 $(info )
 
diff --git a/README.md b/README.md
index d0ae2efb9..775ce2c88 100644
--- a/README.md
+++ b/README.md
@@ -409,6 +409,7 @@ Please refer to [Build llama.cpp locally](./docs/build.md)
 | [BLAS](./docs/build.md#blas-build) | All |
 | [BLIS](./docs/backend/BLIS.md) | All |
 | [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU |
+| [MUSA](./docs/build.md#musa) | Moore Threads GPU |
 | [CUDA](./docs/build.md#cuda) | Nvidia GPU |
 | [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
 | [Vulkan](./docs/build.md#vulkan) | GPU |
diff --git a/docs/build.md b/docs/build.md
index d9d12c467..cfe42ebbf 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -192,6 +192,19 @@ The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/c
 | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial.                                                                         |
 | GGML_CUDA_FA_ALL_QUANTS       | Boolean                | false   | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer.                                                                                                  |
 
+### MUSA
+
+- Using `make`:
+  ```bash
+  make GGML_MUSA=1
+  ```
+- Using `CMake`:
+
+  ```bash
+  cmake -B build -DGGML_MUSA=ON
+  cmake --build build --config Release
+  ```
+
 ### hipBLAS
 
 This provides BLAS acceleration on HIP-supported AMD GPUs.
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 1768a508b..a5c2e96a8 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -113,6 +113,7 @@ set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
 option(GGML_LLAMAFILE                       "ggml: use LLAMAFILE"                             OFF)
 
 option(GGML_CUDA                            "ggml: use CUDA"                                  OFF)
+option(GGML_MUSA                            "ggml: use MUSA"                                  OFF)
 option(GGML_CUDA_FORCE_DMMV                 "ggml: use dmmv instead of mmvq CUDA kernels"     OFF)
 option(GGML_CUDA_FORCE_MMQ                  "ggml: use mmq kernels instead of cuBLAS"         OFF)
 option(GGML_CUDA_FORCE_CUBLAS               "ggml: always use cuBLAS instead of mmq kernels"  OFF)
diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h
index d7903c666..71bb6dcf0 100644
--- a/ggml/include/ggml-cuda.h
+++ b/ggml/include/ggml-cuda.h
@@ -6,6 +6,9 @@
 #ifdef GGML_USE_HIPBLAS
 #define GGML_CUDA_NAME "ROCm"
 #define GGML_CUBLAS_NAME "hipBLAS"
+#elif defined(GGML_USE_MUSA)
+#define GGML_CUDA_NAME "MUSA"
+#define GGML_CUBLAS_NAME "muBLAS"
 #else
 #define GGML_CUDA_NAME "CUDA"
 #define GGML_CUBLAS_NAME "cuBLAS"
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index c6496c921..836496fb9 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -139,6 +139,17 @@ if (GGML_METAL)
         )
 endif()
 
+if (GGML_MUSA)
+    set(CMAKE_C_COMPILER clang)
+    set(CMAKE_C_EXTENSIONS OFF)
+    set(CMAKE_CXX_COMPILER clang++)
+    set(CMAKE_CXX_EXTENSIONS OFF)
+
+    set(GGML_CUDA ON)
+
+    list(APPEND GGML_CDEF_PUBLIC GGML_USE_MUSA)
+endif()
+
 if (GGML_OPENMP)
     find_package(OpenMP)
     if (OpenMP_FOUND)
@@ -147,6 +158,11 @@ if (GGML_OPENMP)
         add_compile_definitions(GGML_USE_OPENMP)
 
         set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+
+        if (GGML_MUSA)
+            set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} "/usr/lib/llvm-10/include/openmp")
+            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} "/usr/lib/llvm-10/lib/libomp.so")
+        endif()
     else()
         message(WARNING "OpenMP not found")
     endif()
@@ -249,7 +265,13 @@ endif()
 if (GGML_CUDA)
     cmake_minimum_required(VERSION 3.18)  # for CMAKE_CUDA_ARCHITECTURES
 
-    find_package(CUDAToolkit)
+    if (GGML_MUSA)
+        list(APPEND CMAKE_MODULE_PATH "/usr/local/musa/cmake/")
+        find_package(MUSAToolkit)
+        set(CUDAToolkit_FOUND ${MUSAToolkit_FOUND})
+    else()
+        find_package(CUDAToolkit)
+    endif()
 
     if (CUDAToolkit_FOUND)
         message(STATUS "CUDA found")
@@ -268,7 +290,11 @@ if (GGML_CUDA)
         endif()
         message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
 
-        enable_language(CUDA)
+        if (GGML_MUSA)
+            set(CMAKE_CUDA_COMPILER ${MUSAToolkit_MCC_EXECUTABLE})
+        else()
+            enable_language(CUDA)
+        endif()
 
         file(GLOB   GGML_HEADERS_CUDA "ggml-cuda/*.cuh")
         list(APPEND GGML_HEADERS_CUDA "../include/ggml-cuda.h")
@@ -332,21 +358,40 @@ if (GGML_CUDA)
             add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
         endif()
 
+        if (GGML_MUSA)
+            set_source_files_properties(${GGML_SOURCES_CUDA} PROPERTIES LANGUAGE CXX)
+            foreach(SOURCE ${GGML_SOURCES_CUDA})
+                set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_22")
+            endforeach()
+        endif()
+
         if (GGML_STATIC)
             if (WIN32)
                 # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
                 set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
             else ()
-                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+                if (GGML_MUSA)
+                    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart_static MUSA::mublas_static)
+                else()
+                    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+                endif()
             endif()
         else()
-            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
+            if (GGML_MUSA)
+                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart MUSA::mublas)
+            else()
+                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
+            endif()
         endif()
 
         if (GGML_CUDA_NO_VMM)
             # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
         else()
-            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
+            if (GGML_MUSA)
+                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ...
+            else()
+                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
+            endif()
         endif()
     else()
         message(WARNING "CUDA not found")
@@ -857,8 +902,10 @@ function(get_flags CCID CCVER)
         set(C_FLAGS   -Wdouble-promotion)
         set(CXX_FLAGS -Wno-array-bounds)
 
-        if (CCVER VERSION_GREATER_EQUAL 7.1.0)
-            list(APPEND CXX_FLAGS -Wno-format-truncation)
+        if (NOT GGML_MUSA)
+            if (CCVER VERSION_GREATER_EQUAL 7.1.0)
+                list(APPEND CXX_FLAGS -Wno-format-truncation)
+            endif()
         endif()
         if (CCVER VERSION_GREATER_EQUAL 8.1.0)
             list(APPEND CXX_FLAGS -Wextra-semi)
@@ -1264,6 +1311,7 @@ endif()
 target_compile_definitions(ggml PUBLIC  ${GGML_CDEF_PUBLIC})
 target_include_directories(ggml PUBLIC ../include)
 target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES})
+target_link_directories(ggml PRIVATE ${GGML_EXTRA_LIBDIRS})
 target_compile_features   (ggml PRIVATE c_std_11) # don't bump
 
 target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS})
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
index fafd5fa7a..e40057632 100644
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -19,7 +19,11 @@ typedef half2 ggml_half2;
 
 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_CUDA)
+#if defined(GGML_COMMON_DECL_MUSA)
+#include <musa_fp16.h>
+#else
 #include <cuda_fp16.h>
+#endif
 #include <cstdint>
 
 typedef half  ggml_half;
@@ -415,7 +419,7 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
 #define GGML_TABLE_END() };
 
 #define GGML_COMMON_IMPL
-#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP)
+#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP) || defined(GGML_COMMON_IMPL_MUSA)
 #include <cstdint>
 
 #define GGML_TABLE_BEGIN(type, name, size) static const __device__ type name[size] = {
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 54ccf6bb1..c73ae40d4 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -167,7 +167,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;
 
-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
         CUdevice device;
         CU_CHECK(cuDeviceGet(&device, id));
         CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@@ -179,7 +179,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
             alloc_prop.location.id = id;
             CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
         }
-#endif // !defined(GGML_USE_HIPBLAS)
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
         info.devices[id].vmm = !!device_vmm;
 
         cudaDeviceProp prop;
@@ -315,7 +315,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
 };
 
 // pool with virtual memory
-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
 struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
     static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
 
@@ -409,14 +409,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
         GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
     }
 };
-#endif // !defined(GGML_USE_HIPBLAS)
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
 
 std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
     if (ggml_cuda_info().devices[device].vmm) {
         return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
     }
-#endif
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
     return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
 }
 
@@ -1341,7 +1341,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
 static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
     void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
 
-#if !defined(GGML_USE_HIPBLAS)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
     // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
     cudaMemcpy3DPeerParms p = {};
     p.dstDevice = dstDevice;
@@ -1355,7 +1355,7 @@ static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
     GGML_UNUSED(dstDevice);
     GGML_UNUSED(srcDevice);
     return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
-#endif // !defined(GGML_USE_HIPBLAS)
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
 }
 
 static void ggml_cuda_op_mul_mat(
@@ -1828,6 +1828,9 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
         }
     }
 #else
+#ifdef GGML_USE_MUSA
+    GGML_ASSERT(false);
+#else // !GGML_USE_MUSA
     if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
         // there is no broadcast and src0, src1 are contiguous across dims 2, 3
         // use cublasGemmStridedBatchedEx
@@ -1870,6 +1873,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     }
+#endif // GGML_USE_MUSA
 #endif
 
     if (dst->op_params[0] == GGML_PREC_DEFAULT) {
@@ -3027,7 +3031,7 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
         return false;
     }
 
-#if CUDART_VERSION >= 11100
+#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
     cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
     if (err != cudaSuccess) {
         // clear the error
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index eac026f47..8c3c20b90 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -12,6 +12,10 @@
 #else
 #define GGML_COMMON_DECL_CUDA
 #define GGML_COMMON_IMPL_CUDA
+#if defined(GGML_USE_MUSA)
+#define GGML_COMMON_DECL_MUSA
+#define GGML_COMMON_IMPL_MUSA
+#endif
 #endif
 #include "ggml-common.h"
 
@@ -114,6 +118,150 @@
 #define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
 #define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
 #define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
+#elif defined(GGML_USE_MUSA)
+#include <musa_runtime.h>
+#include <musa.h>
+#include <mublas.h>
+#include <musa_fp16.h>
+// XXX: Keep the following order the same as hipBLAS
+// #define CUBLAS_COMPUTE_16F MUBLAS_COMPUTE_16F
+// #define CUBLAS_COMPUTE_32F MUBLAS_COMPUTE_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
+#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
+#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
+#define CUBLAS_OP_N MUBLAS_OP_N
+#define CUBLAS_OP_T MUBLAS_OP_T
+#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
+// #define CUBLAS_TF32_TENSOR_OP_MATH 0
+#define CUDA_R_16F  MUSA_R_16F
+#define CUDA_R_32F  MUSA_R_32F
+// #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+// #define cublasComputeType_t mublasComputeType_t
+#define cublasCreate mublasCreate
+#define cublasDestroy mublasDestroy
+#define cublasGemmEx mublasGemmEx
+#define cublasGemmBatchedEx mublasGemmBatchedEx
+#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
+#define cublasHandle_t mublasHandle_t
+// #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
+#define cublasSetMathMode mublasSetMathMode
+#define cublasSetStream mublasSetStream
+#define cublasSgemm mublasSgemm
+#define cublasStatus_t mublasStatus_t
+#define cudaDataType_t musaDataType_t // NOTE(review): the earlier remark here ("deprecated, new hipblasDatatype not in 5.6") appears to be carried over from the hipBLAS mapping, not written for MUSA — confirm and drop
+#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
+#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
+#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
+#define cudaDeviceProp musaDeviceProp
+#define cudaDeviceSynchronize musaDeviceSynchronize
+#define cudaError_t musaError_t
+#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
+#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
+#define cudaEventCreateWithFlags musaEventCreateWithFlags
+#define cudaEventDisableTiming musaEventDisableTiming
+#define cudaEventRecord musaEventRecord
+#define cudaEventSynchronize musaEventSynchronize
+#define cudaEvent_t musaEvent_t
+#define cudaEventDestroy musaEventDestroy
+#define cudaFree musaFree
+#define cudaFreeHost musaFreeHost
+#define cudaGetDevice musaGetDevice
+#define cudaGetDeviceCount musaGetDeviceCount
+#define cudaGetDeviceProperties musaGetDeviceProperties
+#define cudaGetErrorString musaGetErrorString
+#define cudaGetLastError musaGetLastError
+#define cudaHostRegister musaHostRegister
+#define cudaHostRegisterPortable musaHostRegisterPortable
+#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
+#define cudaHostUnregister musaHostUnregister
+#define cudaLaunchHostFunc musaLaunchHostFunc
+#define cudaMalloc musaMalloc
+#define cudaMallocHost musaMallocHost
+#define cudaMemcpy musaMemcpy
+#define cudaMemcpyAsync musaMemcpyAsync
+#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
+#define cudaMemcpy2DAsync musaMemcpy2DAsync
+#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
+#define cudaMemcpyKind musaMemcpyKind
+#define cudaMemset musaMemset
+#define cudaMemsetAsync musaMemsetAsync
+#define cudaMemGetInfo musaMemGetInfo
+#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
+#define cudaSetDevice musaSetDevice
+#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
+#define cudaStreamDestroy musaStreamDestroy
+#define cudaStreamFireAndForget musaStreamFireAndForget
+#define cudaStreamNonBlocking musaStreamNonBlocking
+#define cudaStreamPerThread musaStreamPerThread
+#define cudaStreamSynchronize musaStreamSynchronize
+#define cudaStreamWaitEvent musaStreamWaitEvent
+#define cudaStream_t musaStream_t
+#define cudaSuccess musaSuccess
+
+// XXX: Other CUDA => MUSA mapping
+#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
+#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
+#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
+#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
+#define CUdevice MUdevice
+#define CUdeviceptr MUdeviceptr
+#define CUmemAccessDesc MUmemAccessDesc
+#define CUmemAllocationProp MUmemAllocationProp
+#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
+#define cuDeviceGet muDeviceGet
+#define cuDeviceGetAttribute muDeviceGetAttribute
+#define cuMemAddressFree muMemAddressFree
+#define cuMemAddressReserve muMemAddressReserve
+#define cuMemCreate muMemCreate
+#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
+#define cuMemMap muMemMap
+#define cuMemRelease muMemRelease
+#define cuMemSetAccess muMemSetAccess
+#define cuMemUnmap muMemUnmap
+#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
+#define cudaFuncSetAttribute musaFuncSetAttribute
+#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
+#define make_cudaExtent make_musaExtent
+#define make_cudaPitchedPtr make_musaPitchedPtr
+
+// XXX: USE_CUDA_GRAPH
+#define CUDA_SUCCESS MUSA_SUCCESS
+#define CUresult MUresult
+#define cuGetErrorString muGetErrorString
+#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
+#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
+#define cudaGraphDestroy musaGraphDestroy
+#define cudaGraphExecDestroy musaGraphExecDestroy
+#define cudaGraphExec_t musaGraphExec_t
+#define cudaGraphExecUpdate musaGraphExecUpdate
+#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
+#define cudaGraphGetNodes musaGraphGetNodes
+#define cudaGraphInstantiate musaGraphInstantiate
+#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
+#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
+#define cudaGraphLaunch musaGraphLaunch
+#define cudaGraphNodeGetType musaGraphNodeGetType
+#define cudaGraphNode_t musaGraphNode_t
+#define cudaGraphNodeType musaGraphNodeType
+#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
+#define cudaGraph_t musaGraph_t
+#define cudaKernelNodeParams musaKernelNodeParams
+#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
+#define cudaStreamEndCapture musaStreamEndCapture
+
+// XXX: cuBLAS => muBLAS mapping
+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
+#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
+#define CUBLAS_COMPUTE_16F CUDA_R_16F
+#define CUBLAS_COMPUTE_32F CUDA_R_32F
+#define cublasComputeType_t cudaDataType_t
+
+// XXX: Clang builtins mapping
+#define __vsub4   __vsub4_musa
+#define __vcmpeq4 __vcmpeq4_musa
+#define __vcmpne4 __vcmpne4_musa
 #else
 #include <cuda_runtime.h>
 #include <cuda.h>
@@ -168,9 +316,13 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
 
 #define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
 
-#if CUDART_VERSION >= 12000
+#if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
     static const char * cublas_get_error_str(const cublasStatus_t err) {
+#ifndef GGML_USE_MUSA
         return cublasGetStatusString(err);
+#else
+        return mublasStatus_to_string(err);
+#endif // GGML_USE_MUSA
     }
 #else
     static const char * cublas_get_error_str(const cublasStatus_t err) {
@@ -200,7 +352,7 @@ static const char * cu_get_error_str(CUresult err) {
 #define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
 #endif
 
-#if CUDART_VERSION >= 11100
+#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
 #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
 #else
 #define GGML_CUDA_ASSUME(x)
@@ -214,6 +366,42 @@ typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
 #endif //GGML_CUDA_F16
 
+#if defined(GGML_USE_MUSA)
+#ifndef __has_builtin
+    #define __has_builtin(x) 0
+#endif
+
+typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
+
+static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) { // MUSA stand-in that the __vsub4 macro is redirected to
+    return __vsubss4(a, b); // NOTE(review): forwards to the "ss" variant (presumably signed-saturating) — confirm this matches what __vsub4 callers expect
+}
+
+static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) { // byte-wise equality mask of a and b; the __vcmpeq4 macro is redirected here
+    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a); // view a as four uint8_t lanes
+    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b); // view b as four uint8_t lanes
+    unsigned int c; // result word, filled lane by lane through vc below
+    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
+#pragma unroll
+    for (int i = 0; i < 4; ++i) {
+        vc[i] = va[i] == vb[i] ? 0xff : 0x00; // 0xff where the lanes match, 0x00 where they differ
+    }
+    return c;
+}
+
+static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) { // byte-wise inequality mask of a and b; the __vcmpne4 macro is redirected here
+    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a); // view a as four uint8_t lanes
+    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b); // view b as four uint8_t lanes
+    unsigned int c; // result word, filled lane by lane through vc below
+    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
+#pragma unroll
+    for (int i = 0; i < 4; ++i) {
+        vc[i] = va[i] == vb[i] ? 0x00 : 0xff; // 0xff where the lanes differ, 0x00 where they match
+    }
+    return c;
+}
+#endif // defined(GGML_USE_MUSA)
+
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
 
@@ -455,7 +643,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
     const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
     return mask_low | mask_high;
 }
-#endif // CUDART_VERSION < 12000
+#endif // CUDART_VERSION < CUDART_HMASK
 
 static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)

From 4c676c85e59ef8f771f3a129e6eb217552139231 Mon Sep 17 00:00:00 2001
From: compilade <git@compilade.net>
Date: Sun, 28 Jul 2024 00:42:05 -0400
Subject: [PATCH 35/35] llama : refactor session file management (#8699)

* llama : refactor session file management

* llama : saving and restoring state checks for overflow

The size of the buffers should now be given to the functions working
with them; otherwise, a truncated file could cause out-of-bounds reads.

* llama : stream from session file instead of copying into a big buffer

Loading session files should no longer cause a memory usage spike.

* llama : llama_state_get_size returns the actual size instead of max

This is a breaking change, but makes that function *much* easier
to keep up to date, and it also makes it reflect the behavior
of llama_state_seq_get_size.

* llama : share code between whole and seq_id-specific state saving

Both session file types now use a more similar format.

* llama : no longer store all hparams in session files

Instead, the model arch name is stored.
The layer count and the embedding dimensions of the KV cache
are still verified when loading.
Storing all the hparams is not necessary.

* llama : fix uint64_t format type

* llama : various integer type cast and format string fixes

Some platforms use "%lu" and others "%llu" for uint64_t.
Not sure how to handle that, so casting to size_t when displaying errors.

* llama : remove _context suffix for llama_data_context

* llama : fix session file loading

llama_state_get_size cannot be used to get the max size anymore.

* llama : more graceful error handling of invalid session files

* llama : remove LLAMA_MAX_RNG_STATE

It's no longer necessary to limit the size of the RNG state,
because the max size of session files is not estimated anymore.

* llama : cast seq_id in comparison with unsigned n_seq_max
---
 examples/save-load-state/save-load-state.cpp |   20 +-
 include/llama.h                              |   23 +-
 src/llama.cpp                                | 1524 +++++++++---------
 3 files changed, 749 insertions(+), 818 deletions(-)

diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index 00c2277ac..d8afdc141 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -47,7 +47,7 @@ int main(int argc, char ** argv) {
     // save state (rng, logits, embedding and kv_cache) to file
     {
         std::vector<uint8_t> state_mem(llama_state_get_size(ctx));
-        const size_t written = llama_state_get_data(ctx, state_mem.data());
+        const size_t written = llama_state_get_data(ctx, state_mem.data(), state_mem.size());
 
         FILE *fp_write = fopen("dump_state.bin", "wb");
         fwrite(state_mem.data(), 1, written, fp_write);
@@ -99,13 +99,16 @@ int main(int argc, char ** argv) {
 
     // load state (rng, logits, embedding and kv_cache) from file
     {
-        std::vector<uint8_t> state_mem(llama_state_get_size(ctx2));
+        std::vector<uint8_t> state_mem;
 
         FILE * fp_read = fopen("dump_state.bin", "rb");
+        fseek(fp_read, 0, SEEK_END);
+        state_mem.resize(ftell(fp_read));
+        fseek(fp_read, 0, SEEK_SET);
         const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
         fclose(fp_read);
 
-        if (read != llama_state_set_data(ctx2, state_mem.data())) {
+        if (read != llama_state_set_data(ctx2, state_mem.data(), state_mem.size())) {
             fprintf(stderr, "\n%s : failed to read state\n", __func__);
             llama_free(ctx2);
             llama_free_model(model);
@@ -159,13 +162,16 @@ int main(int argc, char ** argv) {
 
     // load state (rng, logits, embedding and kv_cache) from file
     {
-        std::vector<uint8_t> state_mem(llama_state_get_size(ctx3));
+        std::vector<uint8_t> state_mem;
 
         FILE * fp_read = fopen("dump_state.bin", "rb");
+        fseek(fp_read, 0, SEEK_END);
+        state_mem.resize(ftell(fp_read));
+        fseek(fp_read, 0, SEEK_SET);
         const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
         fclose(fp_read);
 
-        if (read != llama_state_set_data(ctx3, state_mem.data())) {
+        if (read != llama_state_set_data(ctx3, state_mem.data(), state_mem.size())) {
             fprintf(stderr, "\n%s : failed to read state\n", __func__);
             llama_free(ctx3);
             llama_free_model(model);
@@ -182,7 +188,7 @@ int main(int argc, char ** argv) {
     {
         // save kv of seq 0
         std::vector<uint8_t> seq_store(llama_state_seq_get_size(ctx3, 0));
-        const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), 0);
+        const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), seq_store.size(), 0);
         if (ncopy != seq_store.size()) {
             fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
             llama_free(ctx3);
@@ -196,7 +202,7 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s : kv cache cleared\n", __func__);
 
         // restore kv into seq 1
-        const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), 1);
+        const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), seq_store.size(), 1);
         if (nset != seq_store.size()) {
             fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
             llama_free(ctx3);
diff --git a/include/llama.h b/include/llama.h
index 413070d95..f23355a6b 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -33,17 +33,15 @@
 
 #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
 
-#define LLAMA_MAX_RNG_STATE (64*1024)
-
 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 #define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
 
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 7
+#define LLAMA_SESSION_VERSION 8
 
 #define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
-#define LLAMA_STATE_SEQ_VERSION 1
+#define LLAMA_STATE_SEQ_VERSION 2
 
 #ifdef __cplusplus
 extern "C" {
@@ -691,10 +689,11 @@ extern "C" {
     // State / sessions
     //
 
-    // Returns the maximum size in bytes of the state (rng, logits, embedding
-    // and kv_cache) - will often be smaller after compacting tokens
-    LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx);
-    LLAMA_API DEPRECATED(size_t llama_get_state_size(const struct llama_context * ctx),
+    // Returns the *actual* size in bytes of the state
+    // (rng, logits, embedding and kv_cache)
+    // Only use when saving the state, not when restoring it, otherwise the size may be too small.
+    LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
+    LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
         "use llama_state_get_size instead");
 
     // Copies the state to the specified destination address.
@@ -702,7 +701,8 @@ extern "C" {
     // Returns the number of bytes copied
     LLAMA_API size_t llama_state_get_data(
             struct llama_context * ctx,
-                         uint8_t * dst);
+                         uint8_t * dst,
+                          size_t   size);
     LLAMA_API DEPRECATED(size_t llama_copy_state_data(
             struct llama_context * ctx,
                          uint8_t * dst),
@@ -712,7 +712,8 @@ extern "C" {
     // Returns the number of bytes read
     LLAMA_API size_t llama_state_set_data(
             struct llama_context * ctx,
-                   const uint8_t * src);
+                   const uint8_t * src,
+                          size_t   size);
     LLAMA_API DEPRECATED(size_t llama_set_state_data(
             struct llama_context * ctx,
                    const uint8_t * src),
@@ -754,6 +755,7 @@ extern "C" {
     LLAMA_API size_t llama_state_seq_get_data(
             struct llama_context * ctx,
                          uint8_t * dst,
+                          size_t   size,
                     llama_seq_id   seq_id);
 
     // Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence
@@ -763,6 +765,7 @@ extern "C" {
     LLAMA_API size_t llama_state_seq_set_data(
             struct llama_context * ctx,
                    const uint8_t * src,
+                          size_t   size,
                     llama_seq_id   dest_seq_id);
 
     LLAMA_API size_t llama_state_seq_save_file(
diff --git a/src/llama.cpp b/src/llama.cpp
index 0345d0062..a207451f5 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2933,7 +2933,7 @@ static bool llama_kv_cache_init(
 
     // TODO: find a nicer way to add other recurrent model architectures
     cache.recurrent = model.arch == LLM_ARCH_MAMBA;
-    cache.v_trans   = !cparams.flash_attn;
+    cache.v_trans   = !cache.recurrent && !cparams.flash_attn;
 
     cache.head = 0;
     cache.size = kv_size;
@@ -17303,18 +17303,18 @@ void llama_kv_cache_update(struct llama_context * ctx) {
 }
 
 // deprecated
-size_t llama_get_state_size(const struct llama_context * ctx) {
+size_t llama_get_state_size(struct llama_context * ctx) {
     return llama_state_get_size(ctx);
 }
 
 // deprecated
 size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
-    return llama_state_get_data(ctx, dst);
+    return llama_state_get_data(ctx, dst, -1); // -1 converts to the largest size_t value: this legacy signature has no buffer size to forward — NOTE(review): presumably this leaves the write unbounded as before; confirm
 }
 
 // deprecated
 size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
-    return llama_state_set_data(ctx, src);
+    return llama_state_set_data(ctx, src, -1); // -1 converts to the largest size_t value: this legacy signature has no buffer size to forward — NOTE(review): presumably this leaves the read unbounded as before; confirm
 }
 
 // deprecated
@@ -17327,603 +17327,205 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
     return llama_state_save_file(ctx, path_session, tokens, n_token_count);
 }
 
-// Returns the *maximum* size of the state
-size_t llama_state_get_size(const struct llama_context * ctx) {
-    const auto & cparams = ctx->cparams;
-    const auto & hparams = ctx->model.hparams;
-
-    // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
-    // for reference, std::mt19937(1337) serializes to 6701 bytes.
-    const size_t s_rng_size        = sizeof(size_t);
-    const size_t s_rng             = LLAMA_MAX_RNG_STATE;
-    const size_t s_n_outputs       = sizeof(size_t);
-    // assume worst case for outputs although only currently set ones are serialized
-    const size_t s_output_pos      = ctx->cparams.n_batch * sizeof(int32_t);
-    const size_t s_logits_size     = sizeof(size_t);
-    const size_t s_logits          = ctx->logits_size ? cparams.n_batch * hparams.n_vocab * sizeof(float) : 0;
-    const size_t s_embedding_size  = sizeof(size_t);
-    const size_t s_embedding       = ctx->embd_size   ? cparams.n_batch * hparams.n_embd  * sizeof(float) : 0;
-    const size_t s_kv_buf_size     = sizeof(size_t);
-    const size_t s_kv_head         = sizeof(uint32_t);
-    const size_t s_kv_size         = sizeof(uint32_t);
-    const size_t s_kv_used         = sizeof(uint32_t);
-    const size_t s_v_trans         = sizeof(uint32_t);
-    const size_t s_kv              = ctx->kv_self.total_size();
-    const size_t s_kv_cell         = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id);
-    const size_t s_kv_cells        = ctx->kv_self.size * s_kv_cell;
-
-    const size_t s_total = (
-        + s_rng_size
-        + s_rng
-        + s_n_outputs
-        + s_output_pos
-        + s_logits_size
-        + s_logits
-        + s_embedding_size
-        + s_embedding
-        + s_kv_buf_size
-        + s_kv_head
-        + s_kv_size
-        + s_kv_used
-        + s_v_trans
-        + s_kv
-        + s_kv_cells
-    );
-
-    // on session change it is very likely that the state size has changed - so we need to update this function
-    static_assert(LLAMA_SESSION_VERSION == 7, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");
-
-    return s_total;
-}
-
-// llama_context_data
-struct llama_data_context {
+// TODO: replace all non-fatal assertions with returned errors or exceptions
+struct llama_data_write {
     virtual void write(const void * src, size_t size) = 0;
     virtual size_t get_size_written() = 0;
-    virtual ~llama_data_context() = default;
-};
+    virtual ~llama_data_write() = default;
 
-struct llama_data_buffer_context : llama_data_context {
-    uint8_t * ptr;
-    size_t size_written = 0;
+    void write_string(const std::string & str) {
+        uint32_t str_size = str.size();
 
-    llama_data_buffer_context(uint8_t * p) : ptr(p) {}
-
-    void write(const void * src, size_t size) override {
-        memcpy(ptr, src, size);
-        ptr += size;
-        size_written += size;
+        write(&str_size,  sizeof(str_size));
+        write(str.data(), str_size);
     }
 
-    size_t get_size_written() override {
-        return size_written;
-    }
-};
-
-struct llama_data_file_context : llama_data_context {
-    llama_file * file;
-    size_t size_written = 0;
-
-    llama_data_file_context(llama_file * f) : file(f) {}
-
-    void write(const void * src, size_t size) override {
-        file->write_raw(src, size);
-        size_written += size;
+    void write_model_info(const struct llama_context * ctx) {
+        std::string arch_str = LLM_ARCH_NAMES.at(ctx->model.arch);
+        write_string(arch_str);
+        // TODO: add more model-specific info which should prevent loading the session file if not identical
     }
 
-    size_t get_size_written() override {
-        return size_written;
-    }
-};
-
-/** copy state data into either a buffer or file depending on the passed in context
- *
- * file context:
- * llama_file file("/path", "wb");
- * llama_data_file_context data_ctx(&file);
- * llama_state_get_data(ctx, &data_ctx);
- *
- * buffer context:
- * std::vector<uint8_t> buf(max_size, 0);
- * llama_data_buffer_context data_ctx(&buf.data());
- * llama_state_get_data(ctx, &data_ctx);
- *
-*/
-static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
-    llama_synchronize(ctx);
-
-    // copy rng
-    {
+    void write_rng(const std::mt19937 & rng) {
         std::ostringstream rng_ss;
-        rng_ss << ctx->sampling.rng;
+        rng_ss << rng;
 
-        const std::string & rng_str  = rng_ss.str();
-        const size_t        rng_size = rng_str.size();
+        const std::string & rng_str = rng_ss.str();
 
-        GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
-
-        data_ctx->write(&rng_size,      sizeof(rng_size));
-        data_ctx->write(rng_str.data(), rng_size);
+        write_string(rng_str);
     }
 
-    // copy outputs
-    {
-        // Can't use ctx->n_outputs because it's not for the
-        // entire last batch when n_ubatch is smaller than n_batch
-        size_t n_outputs = 0;
+    void write_output_ids(const struct llama_context * ctx) {
+        const uint32_t n_outputs = ctx->n_outputs;
 
-        // copy output ids
-        {
-            std::vector<int32_t> output_pos;
-
-            const size_t    n_batch = ctx->cparams.n_batch;
-            const auto & output_ids = ctx->output_ids;
-
-            output_pos.resize(ctx->output_size);
-
-            // build a more compact representation of the output ids
-            for (size_t i = 0; i < n_batch; ++i) {
-                // map an output id to a position in the batch
-                int32_t pos = output_ids[i];
-                if (pos >= 0) {
-                    if ((size_t) pos >= n_outputs) {
-                        n_outputs = pos + 1;
-                    }
-                    GGML_ASSERT((size_t) pos < ctx->output_size);
-                    output_pos[pos] = i;
-                }
-            }
-
-            data_ctx->write(&n_outputs, sizeof(n_outputs));
-
-            if (n_outputs) {
-                data_ctx->write(output_pos.data(), n_outputs * sizeof(int32_t));
-            }
-        }
-
-        // copy logits
-        {
-            const size_t logits_size = std::min(ctx->logits_size, n_outputs * ctx->model.hparams.n_vocab);
-
-            data_ctx->write(&logits_size, sizeof(logits_size));
-
-            if (logits_size) {
-                data_ctx->write(ctx->logits, logits_size * sizeof(float));
-            }
-        }
-
-        // copy embeddings
-        {
-            const size_t embeddings_size = std::min(ctx->embd_size, n_outputs * ctx->model.hparams.n_embd);
-
-            data_ctx->write(&embeddings_size, sizeof(embeddings_size));
-
-            if (embeddings_size) {
-                data_ctx->write(ctx->embd, embeddings_size * sizeof(float));
-            }
-        }
-    }
-
-    // copy kv cache
-    {
-        const auto & kv_self = ctx->kv_self;
-        const auto & hparams = ctx->model.hparams;
-
-        const uint32_t n_layer      = hparams.n_layer;
-
-        // NOTE: kv_size and kv_buf_size are mostly used for sanity checks
-        const uint32_t kv_head     = llama_kv_cache_cell_max(kv_self);
-        const uint32_t kv_size     = kv_self.size;
-        const size_t   kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head;
-        const uint32_t kv_used     = kv_self.used;
-        const uint32_t v_trans     = kv_self.v_trans ? 1 : 0;
-
-        data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
-        data_ctx->write(&kv_head,     sizeof(kv_head));
-        data_ctx->write(&kv_size,     sizeof(kv_size));
-        data_ctx->write(&kv_used,     sizeof(kv_used));
-        data_ctx->write(&v_trans,     sizeof(v_trans));
-
-        if (kv_buf_size) {
-            const size_t pre_kv_buf_size = data_ctx->get_size_written();
-
-            std::vector<uint8_t> tmp_buf;
-            for (int il = 0; il < (int) n_layer; ++il) {
-                const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
-                const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
-
-                const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
-
-                tmp_buf.resize(k_size);
-                ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
-                data_ctx->write(tmp_buf.data(), tmp_buf.size());
-
-                if (kv_self.recurrent || !kv_self.v_trans) {
-                    // v is contiguous for recurrent models
-                    // TODO: use other tensors for state models than k and v
-                    const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
-
-                    tmp_buf.resize(v_size);
-                    ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), 0, tmp_buf.size());
-                    data_ctx->write(tmp_buf.data(), tmp_buf.size());
-                    continue;
-                }
-
-                // v is not contiguous, copy row by row
-                const size_t v_row_size   = ggml_row_size(kv_self.v_l[il]->type, kv_head);
-                const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
-
-                tmp_buf.resize(v_row_size);
-                for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
-                    ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
-                    data_ctx->write(tmp_buf.data(), tmp_buf.size());
-                }
-            }
-            GGML_ASSERT(kv_buf_size == data_ctx->get_size_written() - pre_kv_buf_size);
-        }
-
-        for (uint32_t i = 0; i < kv_head; ++i) {
-            const auto & cell = kv_self.cells[i];
-
-            const llama_pos pos         = cell.pos;
-            const size_t    seq_id_size = cell.seq_id.size();
-
-            data_ctx->write(&pos,         sizeof(pos));
-            data_ctx->write(&seq_id_size, sizeof(seq_id_size));
-
-            for (auto seq_id : cell.seq_id) {
-                data_ctx->write(&seq_id, sizeof(seq_id));
-            }
-        }
-    }
-}
-
-size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
-    llama_data_buffer_context data_ctx(dst);
-    llama_state_get_data_internal(ctx, &data_ctx);
-
-    return data_ctx.get_size_written();
-}
-
-// Sets the state reading from the specified source address
-size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
-    llama_synchronize(ctx);
-
-    const uint8_t * inp = src;
-
-    // set rng
-    {
-        size_t rng_size;
-        memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
-
-        GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
-
-        std::string rng_str((const char *)inp, rng_size); inp += rng_size;
-
-        std::istringstream rng_ss(rng_str);
-        rng_ss >> ctx->sampling.rng;
-
-        GGML_ASSERT(!rng_ss.fail());
-    }
-
-    // set output ids
-    {
-        size_t n_outputs;
         std::vector<int32_t> output_pos;
 
-        memcpy(&n_outputs, inp, sizeof(n_outputs)); inp += sizeof(n_outputs);
+        const size_t    n_batch = ctx->cparams.n_batch;
+        const auto & output_ids = ctx->output_ids;
 
-        GGML_ASSERT(n_outputs <= llama_output_reserve(*ctx, n_outputs));
+        GGML_ASSERT(n_outputs <= ctx->output_size);
+
+        output_pos.resize(n_outputs);
+
+        // build a more compact representation of the output ids
+        for (size_t i = 0; i < n_batch; ++i) {
+            // map an output id to a position in the batch
+            int32_t pos = output_ids[i];
+            if (pos >= 0) {
+                GGML_ASSERT((uint32_t) pos < n_outputs);
+                output_pos[pos] = i;
+            }
+        }
+
+        write(&n_outputs, sizeof(n_outputs));
 
         if (n_outputs) {
-            output_pos.resize(n_outputs);
-            memcpy(output_pos.data(), inp, n_outputs * sizeof(int32_t));
-            inp += n_outputs * sizeof(int32_t);
-
-            for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
-                int32_t id = output_pos[i];
-                GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch);
-                ctx->output_ids[id] = i;
-            }
-
-            ctx->n_outputs = n_outputs;
+            write(output_pos.data(), n_outputs * sizeof(int32_t));
         }
     }
 
-    // set logits
-    {
-        size_t logits_size;
+    void write_logits(const struct llama_context * ctx) {
+        const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_vocab);
 
-        memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);
-
-        GGML_ASSERT(ctx->logits_size >= logits_size);
+        write(&logits_size, sizeof(logits_size));
 
         if (logits_size) {
-            memcpy(ctx->logits, inp, logits_size * sizeof(float));
-            inp += logits_size * sizeof(float);
+            write(ctx->logits, logits_size * sizeof(float));
         }
     }
 
-    // set embeddings
-    {
-        size_t embeddings_size;
+    void write_embeddings(const struct llama_context * ctx) {
+        const uint64_t embeddings_size = std::min((uint64_t) ctx->embd_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_embd);
 
-        memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
-
-        GGML_ASSERT(ctx->embd_size >= embeddings_size);
+        write(&embeddings_size, sizeof(embeddings_size));
 
         if (embeddings_size) {
-            memcpy(ctx->embd, inp, embeddings_size * sizeof(float));
-            inp += embeddings_size * sizeof(float);
+            write(ctx->embd, embeddings_size * sizeof(float));
         }
     }
 
-    // set kv cache
-    {
-        const auto & kv_self = ctx->kv_self;
-        const auto & hparams = ctx->model.hparams;
+    void write_kv_cache_meta(const llama_kv_cache & kv_self, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) {
 
-        const uint32_t n_layer      = hparams.n_layer;
+        for (const auto & range : cell_ranges) {
+            for (uint32_t i = range.first; i < range.second; ++i) {
+                const auto & cell = kv_self.cells[i];
+                const llama_pos pos      = cell.pos;
+                const uint32_t  n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0;
 
-        size_t   kv_buf_size;
-        uint32_t kv_head;
-        uint32_t kv_size;
-        uint32_t kv_used;
-        uint32_t v_trans;
+                write(&pos,      sizeof(pos));
+                write(&n_seq_id, sizeof(n_seq_id));
 
-        memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
-        memcpy(&kv_head,     inp, sizeof(kv_head));     inp += sizeof(kv_head);
-        memcpy(&kv_size,     inp, sizeof(kv_size));     inp += sizeof(kv_size);
-        memcpy(&kv_used,     inp, sizeof(kv_used));     inp += sizeof(kv_used);
-        memcpy(&v_trans,     inp, sizeof(v_trans));     inp += sizeof(v_trans);
+                if (n_seq_id) {
+                    for (auto seq_id : cell.seq_id) {
+                        write(&seq_id, sizeof(seq_id));
+                    }
+                }
+            }
+        }
+    }
 
-        GGML_ASSERT(kv_self.v_trans == (bool) v_trans); // incompatible V transposition
+    void write_kv_cache_data(const struct llama_context * ctx, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) {
+        const struct llama_kv_cache & kv_self = ctx->kv_self;
+        const struct llama_hparams & hparams = ctx->model.hparams;
 
-        if (kv_self.size != kv_size) {
-            // the KV cache needs to be big enough to load all the KV cells from the saved state
-            GGML_ASSERT(kv_self.size >= kv_head);
+        const uint32_t v_trans = kv_self.v_trans ? 1 : 0;
+        const uint32_t n_layer = hparams.n_layer;
 
-            LLAMA_LOG_INFO("%s: state contains %d KV cells, was saved with kv_size=%d, but is loaded with kv_size=%d (fine, but different)\n",
-                __func__, kv_head, kv_size, kv_self.size);
+        write(&v_trans, sizeof(v_trans));
+        write(&n_layer, sizeof(n_layer));
+
+        std::vector<uint8_t> tmp_buf;
+
+        // Iterate and write all the keys first, each row is a cell
+        // Get whole range at a time
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
+
+            // Write key type
+            const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
+            write(&k_type_i, sizeof(k_type_i));
+
+            // Write row size of key
+            const uint64_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
+            write(&k_size_row, sizeof(k_size_row));
+
+            // Read each range of cells of k_size length each into tmp_buf and write out
+            for (const auto & range : cell_ranges) {
+                const size_t range_size = range.second - range.first;
+                tmp_buf.resize(range_size * k_size_row);
+                ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
+                write(tmp_buf.data(), tmp_buf.size());
+            }
         }
 
-        llama_kv_cache_clear(ctx);
-
-        if (kv_buf_size) {
-            const size_t pre_kv_buf_size = inp - src;
-
-            GGML_ASSERT(kv_self.total_size() >= kv_buf_size);
-
-            for (int il = 0; il < (int) n_layer; ++il) {
-                const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
+        if (!kv_self.v_trans) {
+            for (uint32_t il = 0; il < n_layer; ++il) {
                 const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
 
-                const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
+                // Write value type
+                const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+                write(&v_type_i, sizeof(v_type_i));
 
-                ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
-                inp += k_size;
+                // Write row size of value
+                const uint64_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
+                write(&v_size_row, sizeof(v_size_row));
 
-                if (kv_self.recurrent || !kv_self.v_trans) {
-                    // v is contiguous for recurrent models
-                    // TODO: use other tensors for state models than k and v
-                    const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head);
-
-                    ggml_backend_tensor_set(kv_self.v_l[il], inp, 0, v_size);
-                    inp += v_size;
-                    continue;
-                }
-
-                // v is not contiguous, copy row by row
-                const size_t v_row_size   = ggml_row_size(kv_self.v_l[il]->type, kv_head);
-                const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_self.size);
-
-                for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
-                    ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
-                    inp += v_row_size;
+                // Read each range of cells of v_size length each into tmp_buf and write out
+                for (const auto & range : cell_ranges) {
+                    const size_t range_size = range.second - range.first;
+                    tmp_buf.resize(range_size * v_size_row);
+                    ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
+                    write(tmp_buf.data(), tmp_buf.size());
                 }
             }
-            GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size);
-        }
+        } else {
+            // When v is transposed, we also need the element size and get the element ranges from each row
+            const uint32_t kv_size = kv_self.size;
+            for (uint32_t il = 0; il < n_layer; ++il) {
+                const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
 
-        ctx->kv_self.head = kv_head;
-        ctx->kv_self.used = kv_used;
+                // Write value type
+                const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+                write(&v_type_i, sizeof(v_type_i));
 
-        for (uint32_t i = 0; i < kv_head; ++i) {
-            llama_pos pos;
-            size_t    seq_id_size;
+                // Write element size
+                const uint32_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+                write(&v_size_el, sizeof(v_size_el));
 
-            memcpy(&pos,         inp, sizeof(pos));         inp += sizeof(pos);
-            memcpy(&seq_id_size, inp, sizeof(seq_id_size)); inp += sizeof(seq_id_size);
+                // Write GQA embedding size
+                write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
 
-            ctx->kv_self.cells[i].pos = pos;
-
-            llama_seq_id seq_id;
-
-            for (size_t j = 0; j < seq_id_size; ++j) {
-                memcpy(&seq_id, inp, sizeof(seq_id)); inp += sizeof(seq_id);
-                ctx->kv_self.cells[i].seq_id.insert(seq_id);
+                // For each row, we get the element values of each cell
+                for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                    // Read each range of cells of v_size_el length each into tmp_buf and write out
+                    for (const auto & range : cell_ranges) {
+                        const size_t range_size = range.second - range.first;
+                        const size_t src_offset = (range.first + j * kv_size) * v_size_el;
+                        tmp_buf.resize(range_size * v_size_el);
+                        ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
+                        write(tmp_buf.data(), tmp_buf.size());
+                    }
+                }
             }
         }
     }
 
-    const size_t nread    = inp - src;
-    const size_t max_size = llama_state_get_size(ctx);
+    void write_kv_cache(const struct llama_context * ctx, llama_seq_id seq_id = -1) {
+        const struct llama_kv_cache & kv_self = ctx->kv_self;
+        std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
+        uint32_t cell_count = 0;
 
-    GGML_ASSERT(nread <= max_size);
-
-    return nread;
-}
-
-static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
-    llama_file file(path_session, "rb");
-
-    // sanity checks
-    {
-        const uint32_t magic   = file.read_u32();
-        const uint32_t version = file.read_u32();
-
-        if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
-            LLAMA_LOG_ERROR("%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
-            return false;
-        }
-
-        llama_hparams session_hparams;
-        file.read_raw(&session_hparams, sizeof(llama_hparams));
-
-        if (session_hparams != ctx->model.hparams) {
-            LLAMA_LOG_INFO("%s : model hparams didn't match from session file!\n", __func__);
-            return false;
-        }
-    }
-
-    // load the prompt
-    {
-        const uint32_t n_token_count = file.read_u32();
-
-        if (n_token_count > n_token_capacity) {
-            LLAMA_LOG_ERROR("%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
-            return false;
-        }
-
-        file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
-        *n_token_count_out = n_token_count;
-    }
-
-    // restore the context state
-    {
-        const size_t n_state_size_cur = file.size - file.tell();
-        const size_t n_state_size_max = llama_state_get_size(ctx);
-
-        if (n_state_size_cur > n_state_size_max) {
-            LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
-            return false;
-        }
-
-        std::vector<uint8_t> state_data(n_state_size_max);
-        file.read_raw(state_data.data(), n_state_size_cur);
-
-        llama_state_set_data(ctx, state_data.data());
-    }
-
-    return true;
-}
-
-bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
-    try {
-        return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("error loading session file: %s\n", err.what());
-        return false;
-    }
-}
-
-static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
-    llama_file file(path_session, "wb");
-
-    file.write_u32(LLAMA_SESSION_MAGIC);
-    file.write_u32(LLAMA_SESSION_VERSION);
-
-    file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
-
-    // save the prompt
-    file.write_u32((uint32_t) n_token_count);
-    file.write_raw(tokens, sizeof(llama_token) * n_token_count);
-
-    // save the context state using stream saving
-    llama_data_file_context data_ctx(&file);
-    llama_state_get_data_internal(ctx, &data_ctx);
-
-    return true;
-}
-
-bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
-    try {
-        return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count);
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("error saving session file: %s\n", err.what());
-        return false;
-    }
-}
-
-size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id) {
-    // save the size of size_t as a uint32_t for safety check
-    const size_t size_t_size_size = sizeof(uint32_t);
-
-    // other values
-    const size_t s_cell_count_size = sizeof(uint32_t);
-    const size_t s_layer_count_size = sizeof(uint32_t);
-    const size_t n_embd_v_gqa_size = sizeof(uint32_t);
-
-    size_t s_cell_count = 0;
-    size_t s_cell_data_size = 0;
-    const auto & kv_self = ctx->kv_self;
-    const auto & hparams = ctx->model.hparams;
-
-    const uint32_t n_layer = hparams.n_layer;
-
-    for (uint32_t i = 0; i < kv_self.size; ++i) {
-        const auto & cell = kv_self.cells[i];
-        if (cell.seq_id.count(seq_id) > 0) {
-            ++s_cell_count;
-            s_cell_data_size += sizeof(llama_pos);
-        }
-    }
-
-    for (int il = 0; il < (int)n_layer; ++il) {
-        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
-        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
-
-        // types of keys and values
-        s_cell_data_size += sizeof(int32_t) * 2;
-        // k_size_row and v_size_el values of layer
-        s_cell_data_size += sizeof(size_t) * 2;
-
-        // keys
-        const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
-        s_cell_data_size += k_size_row * s_cell_count;
-
-        // values (transposed)
-        const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
-        s_cell_data_size += v_size_el * s_cell_count * n_embd_v_gqa;
-    }
-
-    const size_t s_total = (
-        size_t_size_size +
-        s_cell_count_size +
-        s_layer_count_size +
-        n_embd_v_gqa_size +
-        s_cell_data_size
-        );
-
-    return s_total;
-}
-
-static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
-    llama_synchronize(ctx);
-
-    const auto & kv_self = ctx->kv_self;
-    GGML_ASSERT(!kv_self.recurrent); // not implemented
-
-    // Save the size of size_t as a uint32_t for safety check
-    const uint32_t size_t_size = sizeof(size_t);
-    data_ctx.write(&size_t_size, sizeof(size_t_size));
-
-    std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
-    uint32_t cell_count = 0;
-
-    // Count the number of cells with the specified seq_id
-    // Find all the ranges of cells with this seq id
-    {
+        // Count the number of cells with the specified seq_id
+        // Find all the ranges of cells with this seq id (or all, when -1)
         uint32_t cell_range_begin = kv_self.size;
         for (uint32_t i = 0; i < kv_self.size; ++i) {
             const auto & cell = kv_self.cells[i];
-            if (cell.has_seq_id(seq_id)) {
+            if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
                 ++cell_count;
                 if (cell_range_begin == kv_self.size) {
                     cell_range_begin = i;
                 }
-            }
-            else {
+            } else {
                 if (cell_range_begin != kv_self.size) {
                     cell_ranges.emplace_back(cell_range_begin, i);
                     cell_range_begin = kv_self.size;
@@ -17940,301 +17542,622 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
             cell_count_check += range.second - range.first;
         }
         GGML_ASSERT(cell_count == cell_count_check);
+
+        write(&cell_count, sizeof(cell_count));
+
+        write_kv_cache_meta(kv_self, cell_ranges, seq_id);
+        write_kv_cache_data(ctx, cell_ranges);
+    }
+};
+
+struct llama_data_read {
+    virtual const uint8_t * read(size_t size) = 0;
+    virtual void read_to(void * dst, size_t size) = 0;
+    virtual size_t get_size_read() = 0;
+    virtual ~llama_data_read() = default;
+
+    void read_string(std::string & str) {
+        uint32_t str_size;
+        read_to(&str_size, sizeof(str_size));
+
+        str.assign((const char *) read(str_size), str_size);
     }
 
-    // Write the cell count
-    data_ctx.write(&cell_count, sizeof(cell_count));
-
-    const auto & hparams = ctx->model.hparams;
-    const uint32_t n_layer = hparams.n_layer;
-
-    // Write the layer count
-    data_ctx.write(&n_layer, sizeof(n_layer));
-
-    // Write n_embd_v_gqa (reference value)
-    {
-        const uint32_t n_embd_v_gqa_ref = hparams.n_embd_v_gqa() + hparams.n_embd_k_s();
-        data_ctx.write(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref));
+    // validate model information
+    void read_model_info(const struct llama_context * ctx) {
+        std::string cur_arch_str = LLM_ARCH_NAMES.at(ctx->model.arch);
+        std::string arch_str;
+        read_string(arch_str);
+        if (cur_arch_str != arch_str) {
+            throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str()));
+        }
+        // TODO: add more info which needs to be identical but which is not verified otherwise
     }
 
-    // Iterate the ranges and write all the pos (this is the token position in the prompt)
-    for (const auto & range : cell_ranges) {
-        for (uint32_t i = range.first; i < range.second; ++i) {
-            const auto & cell = kv_self.cells[i];
-            data_ctx.write(&cell.pos, sizeof(cell.pos));
+    void read_rng(std::mt19937 & rng) {
+        std::string rng_str;
+        read_string(rng_str);
+
+        std::istringstream rng_ss(rng_str);
+        rng_ss >> rng;
+
+        if (rng_ss.fail()) {
+            throw std::runtime_error("failed to load RNG state");
         }
     }
 
-    // Iterate and write all the keys first, each row is a cell
-    // Get whole range at a time
-    std::vector<uint8_t> tmp_buf;
-    for (int il = 0; il < (int)n_layer; ++il) {
-        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
+    void read_output_ids(struct llama_context * ctx) {
+        std::vector<int32_t> output_pos;
 
-        // Write key type
-        const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
-        data_ctx.write(&k_type_i, sizeof(k_type_i));
+        uint32_t n_outputs;
+        read_to(&n_outputs, sizeof(n_outputs));
 
-        // Write row size of key
-        const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
-        data_ctx.write(&k_size_row, sizeof(k_size_row));
+        if (n_outputs > llama_output_reserve(*ctx, n_outputs)) {
+            throw std::runtime_error("could not reserve outputs");
+        }
 
-        // Read each range of cells of k_size length each into tmp_buf and write out
-        for (const auto & range : cell_ranges) {
-            const size_t range_size = range.second - range.first;
-            tmp_buf.resize(range_size * k_size_row);
-            ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
-            data_ctx.write(tmp_buf.data(), tmp_buf.size());
+        if (n_outputs) {
+            output_pos.resize(n_outputs);
+            read_to(output_pos.data(), n_outputs * sizeof(int32_t));
+
+            for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
+                int32_t id = output_pos[i];
+                if ((uint32_t) id >= ctx->cparams.n_batch) {
+                    throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, ctx->cparams.n_batch));
+                }
+                ctx->output_ids[id] = i;
+            }
+
+            ctx->n_outputs = n_outputs;
         }
     }
 
-    // TODO: simplify, reduce copy-paste
-    if (!kv_self.v_trans) {
-        for (int il = 0; il < (int)n_layer; ++il) {
-            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+    void read_logits(struct llama_context * ctx) {
+        uint64_t logits_size;
+        read_to(&logits_size, sizeof(logits_size));
 
-            // Write value type
-            const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
-            data_ctx.write(&v_type_i, sizeof(v_type_i));
+        if (ctx->logits_size < logits_size) {
+            throw std::runtime_error("logits buffer too small");
+        }
 
-            // Write row size of value
-            const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
-            data_ctx.write(&v_size_row, sizeof(v_size_row));
+        if (logits_size) {
+            read_to(ctx->logits, logits_size * sizeof(float));
+        }
+    }
 
-            // Read each range of cells of v_size length each into tmp_buf and write out
-            for (const auto & range : cell_ranges) {
-                const size_t range_size = range.second - range.first;
-                tmp_buf.resize(range_size * v_size_row);
-                ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
-                data_ctx.write(tmp_buf.data(), tmp_buf.size());
+    void read_embeddings(struct llama_context * ctx) {
+        uint64_t embeddings_size;
+        read_to(&embeddings_size, sizeof(embeddings_size));
+
+        if (ctx->embd_size < embeddings_size) {
+            throw std::runtime_error("embeddings buffer too small");
+        }
+
+        if (embeddings_size) {
+            read_to(ctx->embd, embeddings_size * sizeof(float));
+        }
+    }
+
+    bool read_kv_cache_meta(struct llama_context * ctx, uint32_t cell_count, llama_seq_id dest_seq_id = -1) {
+        struct llama_kv_cache & kv_self = ctx->kv_self;
+
+        if (dest_seq_id != -1) {
+            // single sequence
+
+            llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+
+            llama_batch batch = llama_batch_init(cell_count, 0, 1);
+            batch.n_tokens = cell_count;
+            for (uint32_t i = 0; i < cell_count; ++i) {
+                llama_pos pos;
+                uint32_t n_seq_id;
+
+                read_to(&pos, sizeof(pos));
+                read_to(&n_seq_id, sizeof(n_seq_id));
+
+                if (n_seq_id != 0) {
+                    LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
+                    return false;
+                }
+
+                batch.pos[i] = pos;
+                batch.n_seq_id[i] = 1;
+                batch.seq_id[i][0] = dest_seq_id;
+            }
+            if (!llama_kv_cache_find_slot(kv_self, batch)) {
+                llama_batch_free(batch);
+                LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
+                return false;
+            }
+
+            // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
+            // Assume that this is one contiguous block of cells
+            GGML_ASSERT(kv_self.head + cell_count <= kv_self.size);
+            GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]);
+            GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]);
+            GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id));
+            GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id));
+
+            // Cleanup
+            llama_batch_free(batch);
+        } else {
+            // whole KV cache restore
+
+            if (cell_count > kv_self.size) {
+                LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
+                return false;
+            }
+
+            llama_kv_cache_clear(kv_self);
+
+            for (uint32_t i = 0; i < cell_count; ++i) {
+                llama_kv_cell & cell = kv_self.cells[i];
+
+                llama_pos pos;
+                uint32_t  n_seq_id;
+
+                read_to(&pos,      sizeof(pos));
+                read_to(&n_seq_id, sizeof(n_seq_id));
+
+                cell.pos = pos;
+
+                for (uint32_t j = 0; j < n_seq_id; ++j) {
+                    llama_seq_id seq_id;
+                    read_to(&seq_id, sizeof(seq_id));
+
+                    if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
+                        LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
+                        return false;
+                    }
+
+                    cell.seq_id.insert(seq_id);
+                }
+            }
+
+            kv_self.head = 0;
+            kv_self.used = cell_count;
+        }
+
+        return true;
+    }
+
+    bool read_kv_cache_data(struct llama_context * ctx, uint32_t cell_count) {
+        const struct llama_hparams & hparams = ctx->model.hparams;
+        struct llama_kv_cache & kv_self = ctx->kv_self;
+        uint32_t v_trans;
+        uint32_t n_layer;
+        read_to(&v_trans, sizeof(v_trans));
+        read_to(&n_layer, sizeof(n_layer));
+
+        if (n_layer != hparams.n_layer) {
+            LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
+            return false;
+        }
+        if (cell_count > kv_self.size) {
+            LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, kv_self.size);
+            return false;
+        }
+        if (kv_self.v_trans != (bool) v_trans) {
+            LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__);
+            return false;
+        }
+
+        // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
+
+            // Read type of key
+            int32_t k_type_i_ref;
+            read_to(&k_type_i_ref, sizeof(k_type_i_ref));
+            const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
+            if (k_type_i != k_type_i_ref) {
+                LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
+                return false;
+            }
+
+            // Read row size of key
+            uint64_t k_size_row_ref;
+            read_to(&k_size_row_ref, sizeof(k_size_row_ref));
+            const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
+            if (k_size_row != k_size_row_ref) {
+                LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
+                return false;
+            }
+
+            if (cell_count) {
+                // Read and set the keys for the whole cell range
+                ggml_backend_tensor_set(kv_self.k_l[il], read(cell_count * k_size_row), kv_self.head * k_size_row, cell_count * k_size_row);
             }
         }
-    } else {
-        // For the values, they are transposed, so we also need the element size and get the element ranges from each row
-        const uint32_t kv_size = kv_self.size;
-        for (int il = 0; il < (int)n_layer; ++il) {
-            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
 
-            // Write value type
-            const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
-            data_ctx.write(&v_type_i, sizeof(v_type_i));
+        if (!kv_self.v_trans) {
+            for (uint32_t il = 0; il < n_layer; ++il) {
+                const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
 
-            // Write element size
-            const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
-            data_ctx.write(&v_size_el, sizeof(v_size_el));
+                // Read type of value
+                int32_t v_type_i_ref;
+                read_to(&v_type_i_ref, sizeof(v_type_i_ref));
+                const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+                if (v_type_i != v_type_i_ref) {
+                    LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+                    return false;
+                }
 
-            // For each row, we get the element values of each cell
-            for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
-                // Read each range of cells of v_size_el length each into tmp_buf and write out
-                for (const auto & range : cell_ranges) {
-                    const size_t range_size = range.second - range.first;
-                    const size_t src_offset = (range.first + j * kv_size) * v_size_el;
-                    tmp_buf.resize(range_size * v_size_el);
-                    ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
-                    data_ctx.write(tmp_buf.data(), tmp_buf.size());
+                // Read row size of value
+                uint64_t v_size_row_ref;
+                read_to(&v_size_row_ref, sizeof(v_size_row_ref));
+                const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
+                if (v_size_row != v_size_row_ref) {
+                    LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
+                    return false;
+                }
+
+                if (cell_count) {
+                    // Read and set the values for the whole cell range
+                    ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_row), kv_self.head * v_size_row, cell_count * v_size_row);
+                }
+            }
+        } else {
+            // For each layer, read the values for each cell (transposed)
+            for (uint32_t il = 0; il < n_layer; ++il) {
+                const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+
+                // Read type of value
+                int32_t v_type_i_ref;
+                read_to(&v_type_i_ref, sizeof(v_type_i_ref));
+                const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
+                if (v_type_i != v_type_i_ref) {
+                    LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
+                    return false;
+                }
+
+                // Read element size of value
+                uint32_t v_size_el_ref;
+                read_to(&v_size_el_ref, sizeof(v_size_el_ref));
+                const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+                if (v_size_el != v_size_el_ref) {
+                    LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
+                    return false;
+                }
+
+                // Read GQA embedding size
+                uint32_t n_embd_v_gqa_ref;
+                read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref));
+                if (n_embd_v_gqa != n_embd_v_gqa_ref) {
+                    LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il);
+                    return false;
+                }
+
+                if (cell_count) {
+                    // For each row in the transposed matrix, read the values for the whole cell range
+                    for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                        const size_t dst_offset = (kv_self.head + j * kv_self.size) * v_size_el;
+                        ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
+                    }
                 }
             }
         }
+        return true;
     }
 
+    void read_kv_cache(struct llama_context * ctx, llama_seq_id seq_id = -1) {
+        uint32_t cell_count;
+        read_to(&cell_count, sizeof(cell_count));
+
+        bool res = read_kv_cache_meta(ctx, cell_count, seq_id) && read_kv_cache_data(ctx, cell_count);
+
+        if (!res) {
+            if (seq_id == -1) {
+                llama_kv_cache_clear(ctx);
+            } else {
+                llama_kv_cache_seq_rm(ctx, seq_id, -1, -1);
+            }
+            throw std::runtime_error("failed to restore kv cache");
+        }
+    }
+};
+
+struct llama_data_write_dummy : llama_data_write {
+    size_t size_written = 0;
+
+    llama_data_write_dummy() {}
+
+    // TODO: avoid unnecessary calls to ggml_backend_tensor_get in a dummy context
+
+    void write(const void * /* src */, size_t size) override {
+        size_written += size;
+    }
+
+    size_t get_size_written() override {
+        return size_written;
+    }
+};
+
+struct llama_data_write_buffer : llama_data_write {
+    uint8_t * ptr;
+    size_t buf_size = 0;
+    size_t size_written = 0;
+
+    llama_data_write_buffer(uint8_t * p, size_t len) : ptr(p), buf_size(len) {}
+
+    void write(const void * src, size_t size) override {
+        if (size > buf_size) {
+            throw std::runtime_error("unexpectedly reached end of buffer");
+        }
+        memcpy(ptr, src, size);
+        ptr += size;
+        size_written += size;
+        buf_size -= size;
+    }
+
+    size_t get_size_written() override {
+        return size_written;
+    }
+};
+
+struct llama_data_read_buffer : llama_data_read {
+    const uint8_t * ptr;
+    size_t buf_size = 0;
+    size_t size_read = 0;
+
+    llama_data_read_buffer(const uint8_t * p, size_t len) : ptr(p), buf_size(len) {}
+
+    const uint8_t * read(size_t size) override {
+        const uint8_t * base_ptr = ptr;
+        if (size > buf_size) {
+            throw std::runtime_error("unexpectedly reached end of buffer");
+        }
+        ptr += size;
+        size_read += size;
+        buf_size -= size;
+        return base_ptr;
+    }
+
+    void read_to(void * dst, size_t size) override {
+        memcpy(dst, read(size), size);
+    }
+
+    size_t get_size_read() override {
+        return size_read;
+    }
+};
+
+struct llama_data_write_file : llama_data_write {
+    llama_file * file;
+    size_t size_written = 0;
+
+    llama_data_write_file(llama_file * f) : file(f) {}
+
+    void write(const void * src, size_t size) override {
+        file->write_raw(src, size);
+        size_written += size;
+    }
+
+    size_t get_size_written() override {
+        return size_written;
+    }
+};
+
+struct llama_data_read_file : llama_data_read {
+    llama_file * file;
+    size_t size_read = 0;
+    std::vector<uint8_t> temp_buffer;
+
+    llama_data_read_file(llama_file * f) : file(f) {}
+
+    void read_to(void * dst, size_t size) override {
+        file->read_raw(dst, size);
+        size_read += size;
+    }
+
+    const uint8_t * read(size_t size) override {
+        temp_buffer.resize(size);
+        read_to(temp_buffer.data(), size);
+        return temp_buffer.data();
+    }
+
+    size_t get_size_read() override {
+        return size_read;
+    }
+};
+
+/** copy state data into either a buffer or file depending on the passed in context
+ *
+ * file context:
+ * llama_file file("/path", "wb");
+ * llama_data_write_file data_ctx(&file);
+ * llama_state_get_data_internal(ctx, data_ctx);
+ *
+ * buffer context:
+ * std::vector<uint8_t> buf(max_size, 0);
+ * llama_data_write_buffer data_ctx(buf.data(), max_size);
+ * llama_state_get_data_internal(ctx, data_ctx);
+ *
+*/
+static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx) {
+    llama_synchronize(ctx);
+
+    data_ctx.write_model_info(ctx);
+
+    data_ctx.write_rng(ctx->sampling.rng);
+
+    // copy outputs
+    data_ctx.write_output_ids(ctx);
+    data_ctx.write_logits(ctx);
+    data_ctx.write_embeddings(ctx);
+
+    data_ctx.write_kv_cache(ctx);
+
     return data_ctx.get_size_written();
 }
 
-size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_seq_id seq_id) {
-    llama_data_buffer_context data_ctx(dst);
+size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst, size_t size) {
+    llama_data_write_buffer data_ctx(dst, size);
+    try {
+        return llama_state_get_data_internal(ctx, data_ctx);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what());
+        return 0;
+    }
+}
+
+// Returns the *actual* size of the state.
+// Intended to be used when saving the state to a buffer.
+size_t llama_state_get_size(struct llama_context * ctx) {
+    llama_data_write_dummy data_ctx;
+    try {
+        return llama_state_get_data_internal(ctx, data_ctx);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what());
+        return 0;
+    }
+}
+
+static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx) {
+    llama_synchronize(ctx);
+
+    data_ctx.read_model_info(ctx);
+
+    // set rng
+    data_ctx.read_rng(ctx->sampling.rng);
+
+    // set outputs
+    data_ctx.read_output_ids(ctx);
+    data_ctx.read_logits(ctx);
+    data_ctx.read_embeddings(ctx);
+
+    data_ctx.read_kv_cache(ctx);
+
+    return data_ctx.get_size_read();
+}
+
+// Sets the state reading from the specified source address
+size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src, size_t size) {
+    llama_data_read_buffer data_ctx(src, size);
+    try {
+        return llama_state_set_data_internal(ctx, data_ctx);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what());
+        return 0;
+    }
+}
+
+static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    llama_file file(path_session, "rb");
+
+    // sanity checks
+    {
+        const uint32_t magic   = file.read_u32();
+        const uint32_t version = file.read_u32();
+
+        if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
+            LLAMA_LOG_ERROR("%s: unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
+            return false;
+        }
+    }
+
+    // load the prompt
+    {
+        const uint32_t n_token_count = file.read_u32();
+
+        if (n_token_count > n_token_capacity) {
+            LLAMA_LOG_ERROR("%s: token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+            return false;
+        }
+
+        file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+        *n_token_count_out = n_token_count;
+    }
+
+    // restore the context state
+    {
+        const size_t n_state_size_cur = file.size - file.tell();
+
+        llama_data_read_file data_ctx(&file);
+        const size_t n_read = llama_state_set_data_internal(ctx, data_ctx);
+
+        if (n_read != n_state_size_cur) {
+            LLAMA_LOG_ERROR("%s: did not read all of the session file data! size %zu, got %zu\n", __func__, n_state_size_cur, n_read);
+            return false;
+        }
+    }
+    return true;
+}
+
+bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    try {
+        return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: error loading session file: %s\n", __func__, err.what());
+        return false;
+    }
+}
+
+static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+    llama_file file(path_session, "wb");
+
+    file.write_u32(LLAMA_SESSION_MAGIC);
+    file.write_u32(LLAMA_SESSION_VERSION);
+
+    // save the prompt
+    file.write_u32((uint32_t) n_token_count);
+    file.write_raw(tokens, sizeof(llama_token) * n_token_count);
+
+    // save the context state using stream saving
+    llama_data_write_file data_ctx(&file);
+    llama_state_get_data_internal(ctx, data_ctx);
+
+    return true;
+}
+
+bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+    try {
+        return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: error saving session file: %s\n", __func__, err.what());
+        return false;
+    }
+}
+
+static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx, llama_seq_id seq_id) {
+    llama_synchronize(ctx);
+
+    data_ctx.write_kv_cache(ctx, seq_id);
+
+    return data_ctx.get_size_written();
+}
+
+size_t llama_state_seq_get_size(struct llama_context * ctx, llama_seq_id seq_id) {
+    llama_data_write_dummy data_ctx;
     return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
 }
 
-size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
+size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) {
+    llama_data_write_buffer data_ctx(dst, size);
+    try {
+        return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: error saving sequence state: %s\n", __func__, err.what());
+        return 0;
+    }
+}
+
+static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx, llama_seq_id dest_seq_id) {
     llama_synchronize(ctx);
 
-    auto & kv_self = ctx->kv_self;
-    GGML_ASSERT(!kv_self.recurrent); // not implemented
+    data_ctx.read_kv_cache(ctx, dest_seq_id);
 
-    // Wipe the slot
-    llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
+    return data_ctx.get_size_read();
+}
 
-    const uint8_t * inp = src;
-
-    // Read size of size_t
-    uint32_t size_t_size;
-    memcpy(&size_t_size, inp, sizeof(size_t_size));
-    inp += sizeof(size_t_size);
-    if (size_t_size != sizeof(size_t)) {
-        LLAMA_LOG_ERROR("%s: size_t size mismatch\n", __func__);
+size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id dest_seq_id) {
+    llama_data_read_buffer data_ctx(src, size);
+    try {
+        return llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: error loading sequence state: %s\n", __func__, err.what());
         return 0;
     }
-
-    // Read the cell count
-    uint32_t cell_count;
-    memcpy(&cell_count, inp, sizeof(cell_count));
-    inp += sizeof(cell_count);
-
-    // Read the layer count
-    uint32_t n_layer_ref;
-    memcpy(&n_layer_ref, inp, sizeof(n_layer_ref));
-    inp += sizeof(n_layer_ref);
-
-    // Read n_embd_v_gqa
-    uint32_t n_embd_v_gqa_ref;
-    memcpy(&n_embd_v_gqa_ref, inp, sizeof(n_embd_v_gqa_ref));
-    inp += sizeof(n_embd_v_gqa_ref);
-
-    // Sanity check model compatibility
-    const auto & hparams = ctx->model.hparams;
-    const uint32_t n_layer = hparams.n_layer;
-
-    if (n_layer != n_layer_ref) {
-        LLAMA_LOG_ERROR("%s: mismatched n_layer (%d != %d)\n", __func__, n_layer, n_layer_ref);
-        return 0;
-    }
-
-    if (hparams.n_embd_v_gqa() != n_embd_v_gqa_ref) {
-        LLAMA_LOG_ERROR("%s: mismatched n_embd_v_gqa (%d != %d)\n", __func__, hparams.n_embd_v_gqa(), n_embd_v_gqa_ref);
-        return 0;
-    }
-
-    // Allocate the new cells for the slot
-    if (cell_count) {
-        llama_batch batch = llama_batch_init(cell_count, 0, 1);
-        batch.n_tokens = cell_count;
-        for (uint32_t i = 0; i < cell_count; ++i) {
-            llama_pos pos;
-            memcpy(&pos, inp, sizeof(pos));
-            inp += sizeof(pos);
-
-            batch.pos[i] = pos;
-            batch.n_seq_id[i] = 1;
-            batch.seq_id[i][0] = dest_seq_id;
-        }
-        if (!llama_kv_cache_find_slot(kv_self, batch)) {
-            llama_batch_free(batch);
-            LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
-            return 0;
-        }
-
-        // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
-        // Assume that this is one contiguous block of cells
-        GGML_ASSERT(kv_self.head + cell_count <= kv_self.size);
-        GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]);
-        GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]);
-        GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id));
-        GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id));
-
-        // Cleanup
-        llama_batch_free(batch);
-    }
-
-    const uint32_t kv_size = kv_self.size;
-    const uint32_t kv_head = kv_self.head;
-
-    // For each layer, read the keys for each cell, one row is one cell, read as one contiguous blo
-    for (int il = 0; il < (int)n_layer; ++il) {
-        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
-
-        // Read type of key
-        int32_t k_type_i_ref;
-        memcpy(&k_type_i_ref, inp, sizeof(k_type_i_ref));
-        inp += sizeof(k_type_i_ref);
-        const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
-        if (k_type_i != k_type_i_ref) {
-            llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
-            LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
-            return 0;
-        }
-
-        // Read row size of key
-        size_t k_size_row_ref;
-        memcpy(&k_size_row_ref, inp, sizeof(k_size_row_ref));
-        inp += sizeof(k_size_row_ref);
-        const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
-        if (k_size_row != k_size_row_ref) {
-            llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
-            LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, k_size_row_ref, il);
-            return 0;
-        }
-
-        if (cell_count) {
-            // Read and set the keys for the whole cell range
-            ggml_backend_tensor_set(kv_self.k_l[il], inp, kv_head * k_size_row, cell_count * k_size_row);
-            inp += cell_count * k_size_row;
-        }
-    }
-
-    // TODO: simplify, reduce copy-paste
-    if (!kv_self.v_trans) {
-        for (int il = 0; il < (int)n_layer; ++il) {
-            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
-
-            // Read type of value
-            int32_t v_type_i_ref;
-            memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
-            inp += sizeof(v_type_i_ref);
-            const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
-            if (v_type_i != v_type_i_ref) {
-                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
-                LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
-                return 0;
-            }
-
-            // Read row size of value
-            size_t v_size_row_ref;
-            memcpy(&v_size_row_ref, inp, sizeof(v_size_row_ref));
-            inp += sizeof(v_size_row_ref);
-            const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
-            if (v_size_row != v_size_row_ref) {
-                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
-                LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, v_size_row_ref, il);
-                return 0;
-            }
-
-            if (cell_count) {
-                // Read and set the values for the whole cell range
-                ggml_backend_tensor_set(kv_self.v_l[il], inp, kv_head * v_size_row, cell_count * v_size_row);
-                inp += cell_count * v_size_row;
-            }
-        }
-    } else {
-        // For each layer, read the values for each cell (transposed)
-        for (int il = 0; il < (int)n_layer; ++il) {
-            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
-
-            // Read type of value
-            int32_t v_type_i_ref;
-            memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
-            inp += sizeof(v_type_i_ref);
-            const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
-            if (v_type_i != v_type_i_ref) {
-                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
-                LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
-                return 0;
-            }
-
-            // Read element size of value
-            size_t v_size_el_ref;
-            memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
-            inp += sizeof(v_size_el_ref);
-            const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
-            if (v_size_el != v_size_el_ref) {
-                llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
-                LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
-                return 0;
-            }
-
-            if (cell_count) {
-                // For each row in the transposed matrix, read the values for the whole cell range
-                for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
-                    const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
-                    ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
-                    inp += cell_count * v_size_el;
-                }
-            }
-        }
-    }
-
-    const size_t nread = inp - src;
-
-    return nread;
 }
 
 static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
@@ -18244,11 +18167,11 @@ static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, con
     file.write_u32(LLAMA_STATE_SEQ_VERSION);
 
     // save the prompt
-    file.write_u32((uint32_t)n_token_count);
+    file.write_u32((uint32_t) n_token_count);
     file.write_raw(tokens, sizeof(llama_token) * n_token_count);
 
     // save the context state using stream saving
-    llama_data_file_context data_ctx(&file);
+    llama_data_write_file data_ctx(&file);
     llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
 
     const size_t res = file.tell();
@@ -18286,9 +18209,8 @@ static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, con
     // restore the context state
     {
         const size_t state_size = file.size - file.tell();
-        std::vector<uint8_t> state_data(state_size);
-        file.read_raw(state_data.data(), state_size);
-        const size_t nread = llama_state_seq_set_data(ctx, state_data.data(), dest_seq_id);
+        llama_data_read_file data_ctx(&file);
+        const size_t nread = llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id);
         if (!nread) {
             LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
             return 0;
@@ -18304,7 +18226,7 @@ size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepa
     try {
         return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count);
     } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("error saving sequence state file: %s\n", err.what());
+        LLAMA_LOG_ERROR("%s: error saving sequence state file: %s\n", __func__, err.what());
         return 0;
     }
 }
@@ -18313,7 +18235,7 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa
     try {
         return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out);
     } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("error loading sequence state file: %s\n", err.what());
+        LLAMA_LOG_ERROR("%s: error loading sequence state file: %s\n", __func__, err.what());
         return 0;
     }
 }