Merge remote-tracking branch 'origin/master' into llama-model-params
commit c8a9658e65
30 changed files with 7015 additions and 2423 deletions
.github/workflows/build.yml (vendored, 32 changed lines)

@@ -457,22 +457,22 @@ jobs:
 path: |
 cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip

-freeBSD-latest:
-runs-on: macos-12
-steps:
-- name: Clone
-uses: actions/checkout@v3
-
-- name: Build
-uses: cross-platform-actions/action@v0.19.0
-with:
-operating_system: freebsd
-version: '13.2'
-hypervisor: 'qemu'
-run: |
-sudo pkg update
-sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
-gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15
+# freeBSD-latest:
+# runs-on: macos-12
+# steps:
+# - name: Clone
+# uses: actions/checkout@v3
+#
+# - name: Build
+# uses: cross-platform-actions/action@v0.19.0
+# with:
+# operating_system: freebsd
+# version: '13.2'
+# hypervisor: 'qemu'
+# run: |
+# sudo pkg update
+# sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
+# gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15

 release:
 if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

.gitignore (vendored, 2 changed lines)

@@ -52,6 +52,8 @@ models-mnt
 /server
 /simple
 /batched
+/export-lora
+/finetune
 /speculative
 /parallel
 /train-text-from-scratch

Makefile (15 changed lines)

@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative parallel tests/test-c.o
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative parallel finetune export-lora tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama

@@ -500,6 +500,9 @@ console.o: common/console.cpp common/console.h
 grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
     $(CXX) $(CXXFLAGS) -c $< -o $@

+train.o: common/train.cpp common/train.h
+    $(CXX) $(CXXFLAGS) -c $< -o $@
+
 libllama.so: llama.o ggml.o $(OBJS)
     $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

@@ -550,7 +553,7 @@ embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-te
 gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o $(OBJS)
+train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o train.o $(OBJS)
     $(CXX) $(TTFS_CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)

@@ -559,12 +562,18 @@ convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggm
 llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o $(OBJS)
+baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o train.o $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

+finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o common.o train.o $(OBJS)
+    $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+    $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

common/CMakeLists.txt

@@ -9,6 +9,8 @@ add_library(${TARGET} OBJECT
 console.cpp
 grammar-parser.h
 grammar-parser.cpp
+train.h
+train.cpp
 )

 if (BUILD_SHARED_LIBS)

common/common.cpp

@@ -78,7 +78,7 @@ int32_t get_num_physical_cores() {
 return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }

-static void process_escapes(std::string& input) {
+void process_escapes(std::string& input) {
 std::size_t input_len = input.length();
 std::size_t output_idx = 0;

@@ -361,7 +361,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 invalid_param = true;
 break;
 }
-params.lora_adapter = argv[i];
+params.lora_adapter.push_back({argv[i], 1.0f});
+params.use_mmap = false;
+} else if (arg == "--lora-scaled") {
+if (++i >= argc) {
+invalid_param = true;
+break;
+}
+const char * lora_adapter = argv[i];
+if (++i >= argc) {
+invalid_param = true;
+break;
+}
+params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])});
 params.use_mmap = false;
 } else if (arg == "--lora-base") {
 if (++i >= argc) {

@@ -707,6 +719,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 printf(" --verbose-prompt print prompt before generation\n");
 fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
 printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
+printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
 printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
 printf(" -m FNAME, --model FNAME\n");
 printf(" model path (default: %s)\n", params.model.c_str());

@@ -802,10 +815,15 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 return std::make_tuple(nullptr, nullptr);
 }

-if (!params.lora_adapter.empty()) {
+for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
+const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]);
+float lora_scale = std::get<1>(params.lora_adapter[i]);
 int err = llama_model_apply_lora_from_file(model,
-params.lora_adapter.c_str(),
-params.lora_base.empty() ? NULL : params.lora_base.c_str(),
+lora_adapter.c_str(),
+lora_scale,
+((i > 0) || params.lora_base.empty())
+? NULL
+: params.lora_base.c_str(),
 params.n_threads);
 if (err != 0) {
 fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);

@@ -1258,7 +1276,20 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
 fprintf(stream, " %d: %f", lb.first, lb.second);
 }

-fprintf(stream, "lora: %s\n", params.lora_adapter.c_str());
+fprintf(stream, "lora:\n");
+for (std::tuple<std::string, float> la : params.lora_adapter) {
+if (std::get<1>(la) != 1.0f) {
+continue;
+}
+fprintf(stream, " - %s\n", std::get<0>(la).c_str());
+}
+fprintf(stream, "lora_scaled:\n");
+for (std::tuple<std::string, float> la : params.lora_adapter) {
+if (std::get<1>(la) == 1.0f) {
+continue;
+}
+fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
+}
 fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
 fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
 fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");

common/common.h

@@ -86,8 +86,8 @@ struct gpt_params {
 std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
 std::string logdir = ""; // directory in which to save YAML log files

-std::string lora_adapter = ""; // lora adapter path
+std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
 std::string lora_base = ""; // base model path for the lora adapter

 int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
 int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line

@@ -130,6 +130,8 @@ std::string get_system_info(const gpt_params & params);

 std::string gpt_random_prompt(std::mt19937 & rng);

+void process_escapes(std::string& input);
+
 //
 // Model utils
 //

common/train.cpp (new file, 1496 lines)

File diff suppressed because it is too large.

common/train.h (new file, 230 lines)

// Various helper functions and utilities for training

#pragma once

#include <string>
#include <random>
#include <vector>

#include "ggml.h"
#include "llama.h"

typedef std::string mt19937_state;

struct train_state {
    struct ggml_opt_context * opt;

    uint64_t train_its;
    uint64_t train_samples;
    uint64_t train_tokens;
    uint64_t train_epochs;

    size_t shuffle_samples_hash; // fn, sample_count, *zip(sample_begins, sample_sizes)
    mt19937_state shuffle_rng_state_current;
    mt19937_state shuffle_rng_state_next;
    size_t shuffle_sample_count;
    size_t shuffle_next_sample;
};

struct train_params_common {
    const char * fn_train_data;
    const char * fn_checkpoint_in;
    const char * fn_checkpoint_out;
    const char * pattern_fn_it;
    const char * fn_latest;

    bool print_usage;

    int save_every;

    uint32_t seed;

    int n_ctx;
    int n_threads;
    int n_batch;
    int n_gradient_accumulation;
    int n_epochs;

    bool custom_n_ctx;

    bool use_flash;
    bool use_checkpointing;

    std::string sample_start;
    bool include_sample_start;
    bool escape;
    bool overlapping_samples;
    bool fill_with_next_samples;
    bool separate_with_eos;
    bool separate_with_bos;
    bool sample_random_offsets;

    bool force_reshuffle;

    int warmup;
    int cos_decay_steps;
    float cos_decay_restart;
    float cos_decay_min;
    bool enable_restart;

    int opt_past;
    float opt_delta;
    int opt_max_no_improvement;

    int adam_n_iter;
    float adam_alpha;
    float adam_min_alpha;
    float adam_decay;
    int adam_decay_min_ndim;
    float adam_beta1;
    float adam_beta2;
    float adam_gclip;
    float adam_eps_f;
};

typedef void (*save_train_files_callback)(void * data, struct train_state * train);

struct train_opt_callback_data {
    struct train_params_common * params;
    struct train_state * train;
    save_train_files_callback save_cb;
    void * save_data;
    struct llama_context * lctx;
    int last_save_iter;
    llama_token * tokens_data;
    size_t tokens_size;
    size_t * samples_begin;
    size_t * samples_size;
    size_t * shuffled_samples_offs;
    size_t * shuffled_samples_begin;
    size_t * shuffled_samples_size;
    size_t samples_count;
    struct ggml_tensor * tokens_input;
    struct ggml_tensor * target_probs;
    int first_iter;
    int first_epoch;
    int iter_at_last_epoch;
    int64_t last_time;
    double millis_per_iter;
};

struct train_state * init_train_state();
void free_train_state(struct train_state * state);

struct train_params_common get_default_train_params_common();
void print_common_train_usage(int /*argc*/, char ** argv, const struct train_params_common * params);

bool consume_common_train_arg(int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param);
void finish_processing_train_args(struct train_params_common * params);

struct random_normal_distribution;
struct random_uniform_distribution;

struct random_normal_distribution * init_random_normal_distribution (int seed, float mean, float std, float min, float max);
struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max);

void free_random_normal_distribution (struct random_normal_distribution * rnd);
void free_random_uniform_distribution(struct random_uniform_distribution * rnd);

struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd);
struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd);

// generate random float in interval [0,1)
float frand();
float frand_normal (struct random_normal_distribution * rnd);
float frand_uniform(struct random_uniform_distribution * rnd);

int   clamp (const int v, const int min, const int max);
float fclamp(const float v, const float min, const float max);

void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0);
void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1);
void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2);
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3);

size_t tokenize_file(
    struct llama_context * lctx,
    const char * filename,
    const std::string & sample_start,
    bool include_sample_start,
    bool overlapping_samples,
    unsigned context_length,
    std::vector<llama_token> & out_tokens,
    std::vector<size_t> & out_samples_begin,
    std::vector<size_t> & out_samples_size);

int64_t get_example_targets_batch(
    struct llama_context * lctx,
    struct ggml_tensor * tokens_input,
    struct ggml_tensor * target_probs,
    int64_t example_id,
    const size_t * samples_offs,
    const size_t * samples_begin,
    const size_t * samples_size,
    size_t samples_count,
    const llama_token * train_data,
    size_t n_train_data,
    bool separate_with_eos,
    bool separate_with_bos,
    bool fill_with_next_samples,
    bool sample_random_offsets);

void mt19937_set_state(std::mt19937& rng, const mt19937_state& rng_state);
mt19937_state mt19937_get_state(const std::mt19937& rng);
mt19937_state mt19937_seed_to_state(unsigned seed);

mt19937_state shuffle_samples(
    const mt19937_state & rng_state,
    size_t * shuffled_offs,
    size_t * shuffled_begins,
    size_t * shuffled_sizes,
    const size_t * begins,
    const size_t * sizes,
    size_t count);

size_t hash_combine(size_t h1, size_t h2);

size_t compute_samples_hash(
    const char* fn,
    const size_t* samples_begin,
    const size_t* samples_size,
    size_t sample_count);

std::string replace_str(const char * s, const char * needle, const char * replacement);

void print_duration(double milliseconds);

float cosine_decay(
    int64_t step,
    int64_t decay_steps,
    float minimum);

float cosine_decay_restart(
    int64_t step,
    int64_t decay_steps,
    float minimum,
    float restart_step_mult);

float learning_schedule(
    int64_t step,
    int64_t warmup_steps,
    int64_t decay_steps,
    float learning_rate,
    float overall_minimum,
    float cos_decay_minimum,
    float cos_decay_restart_step_mult,
    bool enable_restart);

void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name);

void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt);
void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt);

bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train);
void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train);

std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration);

void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel);

@@ -133,8 +133,6 @@ gguf_writer.add_file_type(ftype)
 print("gguf: get tokenizer metadata")

 tokens: list[bytearray] = []
-scores: list[float] = []
-toktypes: list[int] = []

 tokenizer_json_file = dir_model / 'tokenizer.json'
 if not tokenizer_json_file.is_file():

@@ -177,12 +175,8 @@ for i in range(vocab_size):
 text = bytearray(pad_token)

 tokens.append(text)
-scores.append(0.0) # dymmy
-toktypes.append(gguf.TokenType.NORMAL) # dummy

 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
-gguf_writer.add_token_types(toktypes)

 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)

@@ -117,8 +117,6 @@ gguf_writer.add_file_type(ftype)
 print("gguf: get tokenizer metadata")

 tokens: list[bytearray] = []
-scores: list[float] = []
-toktypes: list[int] = []

 tokenizer_json_file = dir_model / 'tokenizer.json'
 if not tokenizer_json_file.is_file():

@@ -161,12 +159,8 @@ for i in range(vocab_size):
 text = bytearray(pad_token)

 tokens.append(text)
-scores.append(0.0) # dymmy
-toktypes.append(gguf.TokenType.NORMAL) # dummy

 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
-gguf_writer.add_token_types(toktypes)

 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)

examples/CMakeLists.txt

@@ -21,6 +21,7 @@ else()
 add_subdirectory(benchmark)
 add_subdirectory(baby-llama)
 add_subdirectory(train-text-from-scratch)
+add_subdirectory(finetune)
 add_subdirectory(convert-llama2c-to-ggml)
 add_subdirectory(simple)
 add_subdirectory(batched)

@@ -35,4 +36,5 @@ else()
 if (LLAMA_BUILD_SERVER)
 add_subdirectory(server)
 endif()
+add_subdirectory(export-lora)
 endif()

examples/baby-llama/baby-llama.cpp

@@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "train.h"
 #include <vector>
 #include <cassert>
 #include <random>

@@ -14,31 +15,6 @@ constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
 constexpr float rms_norm_eps = 5e-6f;
 #endif

-static float frand() {
-return (float)rand()/(float)RAND_MAX;
-}
-
-struct random_normal_distribution {
-std::mt19937 gen;
-std::normal_distribution<float> nd;
-float min;
-float max;
-};
-
-static void init_random_normal_distribution(
-struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max
-) {
-rnd->gen = std::mt19937(seed);
-rnd->nd = std::normal_distribution<float>{mean, std};
-rnd->min = min;
-rnd->max = max;
-}
-
-static float frand_normal(struct random_normal_distribution * rnd) {
-const float r = rnd->nd(rnd->gen);
-return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
-}
-
 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
 struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);

@@ -93,54 +69,6 @@ static struct ggml_tensor * randomize_tensor(
 return tensor;
 }

-static struct ggml_tensor * randomize_tensor_normal(
-struct ggml_tensor * tensor, int ndims, const int64_t ne[], struct random_normal_distribution * rnd
-) {
-float scale = 1.0; // xavier
-switch (ndims) {
-case 1:
-scale /= sqrtf(ne[0]);
-for (int i0 = 0; i0 < ne[0]; i0++) {
-((float *)tensor->data)[i0] = scale * frand_normal(rnd);
-}
-break;
-case 2:
-scale /= sqrtf(ne[0]+ne[1]);
-for (int i1 = 0; i1 < ne[1]; i1++) {
-for (int i0 = 0; i0 < ne[0]; i0++) {
-((float *)tensor->data)[i1*ne[0] + i0] = scale * frand_normal(rnd);
-}
-}
-break;
-case 3:
-scale /= sqrtf(ne[0]+ne[1]);
-for (int i2 = 0; i2 < ne[2]; i2++) {
-for (int i1 = 0; i1 < ne[1]; i1++) {
-for (int i0 = 0; i0 < ne[0]; i0++) {
-((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
-}
-}
-}
-break;
-case 4:
-scale /= sqrtf(ne[0]+ne[1]);
-for (int i3 = 0; i3 < ne[3]; i3++) {
-for (int i2 = 0; i2 < ne[2]; i2++) {
-for (int i1 = 0; i1 < ne[1]; i1++) {
-for (int i0 = 0; i0 < ne[0]; i0++) {
-((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
-}
-}
-}
-}
-break;
-default:
-assert(false);
-};
-
-return tensor;
-}
-
 struct llama_hparams {
 uint32_t n_vocab = 32000;
 uint32_t n_ctx = 512; // this is provided as user input?

@@ -398,27 +326,29 @@ static void randomize_model(struct llama_model * model, int seed, float mean, fl

 const uint32_t n_layer = hparams.n_layer;

-struct random_normal_distribution rnd;
-init_random_normal_distribution(&rnd, seed, mean, std, min, max);
-randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd);
-randomize_tensor_normal(model->norm, model->norm->n_dims, model->norm->ne, &rnd);
-randomize_tensor_normal(model->output, model->output->n_dims, model->output->ne, &rnd);
+struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
+randomize_tensor_normal(model->tok_embeddings , rnd);
+randomize_tensor_normal(model->norm           , rnd);
+randomize_tensor_normal(model->output         , rnd);

 for (uint32_t i = 0; i < n_layer; ++i) {
 auto & layer = model->layers[i];
-randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd);
+randomize_tensor_normal(layer.attention_norm, rnd);

-randomize_tensor_normal(layer.wq, layer.wq->n_dims, layer.wq->ne, &rnd);
-randomize_tensor_normal(layer.wk, layer.wk->n_dims, layer.wk->ne, &rnd);
-randomize_tensor_normal(layer.wv, layer.wv->n_dims, layer.wv->ne, &rnd);
-randomize_tensor_normal(layer.wo, layer.wo->n_dims, layer.wo->ne, &rnd);
+randomize_tensor_normal(layer.wq, rnd);
+randomize_tensor_normal(layer.wk, rnd);
+randomize_tensor_normal(layer.wv, rnd);
+randomize_tensor_normal(layer.wo, rnd);

-randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd);
+randomize_tensor_normal(layer.ffn_norm, rnd);

-randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd);
-randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd);
-randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd);
+randomize_tensor_normal(layer.w1, rnd);
+randomize_tensor_normal(layer.w2, rnd);
+randomize_tensor_normal(layer.w3, rnd);
 }

+free_random_normal_distribution(rnd);
 }

@@ -429,32 +359,34 @@ static void randomize_model_lora(

 const uint32_t n_layer = hparams.n_layer;

-struct random_normal_distribution rnd;
-init_random_normal_distribution(&rnd, seed, mean, std, min, max);
-randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd);
-randomize_tensor_normal(model->norm, model->norm->n_dims, model->norm->ne, &rnd);
-randomize_tensor_normal(model->outputa, model->outputa->n_dims, model->outputa->ne, &rnd);
-randomize_tensor_normal(model->outputb, model->outputb->n_dims, model->outputb->ne, &rnd);
+struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
+randomize_tensor_normal(model->tok_embeddings, rnd);
+randomize_tensor_normal(model->norm    , rnd);
+randomize_tensor_normal(model->outputa , rnd);
+randomize_tensor_normal(model->outputb , rnd);

 for (uint32_t i = 0; i < n_layer; ++i) {
 auto & layer = model->layers[i];
-randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd);
+randomize_tensor_normal(layer.attention_norm, rnd);

-randomize_tensor_normal(layer.wqa, layer.wqa->n_dims, layer.wqa->ne, &rnd);
-randomize_tensor_normal(layer.wqb, layer.wqb->n_dims, layer.wqb->ne, &rnd);
-randomize_tensor_normal(layer.wka, layer.wka->n_dims, layer.wka->ne, &rnd);
-randomize_tensor_normal(layer.wkb, layer.wkb->n_dims, layer.wkb->ne, &rnd);
-randomize_tensor_normal(layer.wva, layer.wva->n_dims, layer.wva->ne, &rnd);
-randomize_tensor_normal(layer.wvb, layer.wvb->n_dims, layer.wvb->ne, &rnd);
-randomize_tensor_normal(layer.woa, layer.woa->n_dims, layer.woa->ne, &rnd);
-randomize_tensor_normal(layer.wob, layer.wob->n_dims, layer.wob->ne, &rnd);
+randomize_tensor_normal(layer.wqa, rnd);
+randomize_tensor_normal(layer.wqb, rnd);
+randomize_tensor_normal(layer.wka, rnd);
+randomize_tensor_normal(layer.wkb, rnd);
+randomize_tensor_normal(layer.wva, rnd);
+randomize_tensor_normal(layer.wvb, rnd);
+randomize_tensor_normal(layer.woa, rnd);
+randomize_tensor_normal(layer.wob, rnd);

-randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd);
+randomize_tensor_normal(layer.ffn_norm, rnd);

-randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd);
-randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd);
-randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd);
+randomize_tensor_normal(layer.w1, rnd);
+randomize_tensor_normal(layer.w2, rnd);
+randomize_tensor_normal(layer.w3, rnd);
 }

+free_random_normal_distribution(rnd);
 }

 static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {

@@ -762,32 +694,6 @@ static struct ggml_tensor * forward(
 return inpL;
 }

-static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
-GGML_ASSERT(tensor->n_dims == 1);
-GGML_ASSERT(tensor->ne[0] == ne0);
-}
-
-static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
-GGML_ASSERT(tensor->n_dims == 2);
-GGML_ASSERT(tensor->ne[0] == ne0);
-GGML_ASSERT(tensor->ne[1] == ne1);
-}
-
-static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
-GGML_ASSERT(tensor->n_dims == 3);
-GGML_ASSERT(tensor->ne[0] == ne0);
-GGML_ASSERT(tensor->ne[1] == ne1);
-GGML_ASSERT(tensor->ne[2] == ne2);
-}
-
-static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
-GGML_ASSERT(tensor->n_dims == 4);
-GGML_ASSERT(tensor->ne[0] == ne0);
-GGML_ASSERT(tensor->ne[1] == ne1);
-GGML_ASSERT(tensor->ne[2] == ne2);
-GGML_ASSERT(tensor->ne[3] == ne3);
-}
-
 static struct ggml_tensor * forward_batch(
 struct llama_model * model,
 struct llama_kv_cache * cache,

examples/export-lora/CMakeLists.txt (new file, 5 lines)

set(TARGET export-lora)
add_executable(${TARGET} export-lora.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/export-lora/README.md (new file, 26 lines)

# export-lora

Apply LORA adapters to base model and export the resulting model.

```
usage: export-lora [options]

options:
  -h, --help                         show this help message and exit
  -m FNAME, --model-base FNAME       model path from which to load base model (default '')
  -o FNAME, --model-out FNAME        path to save exported model (default '')
  -l FNAME, --lora FNAME             apply LoRA adapter
  -s FNAME S, --lora-scaled FNAME S  apply LoRA adapter with user defined scaling S
  -t N, --threads N                  number of threads to use during computation (default: 4)
```

For example:

```bash
./bin/export-lora \
    -m open-llama-3b-v2-q8_0.gguf \
    -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
    -l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
```

Multiple LORA adapters can be applied by passing multiple `-l FN` or `-s FN S` command line parameters.

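As a rough illustration of that last point, a hypothetical invocation combining both flags (the adapter and output file names below are made up, not taken from this commit) might look like:

```bash
./bin/export-lora \
    -m open-llama-3b-v2-q8_0.gguf \
    -o open-llama-3b-v2-q8_0-merged.gguf \
    -l lora-adapter-a-LATEST.bin \
    -s lora-adapter-b-LATEST.bin 0.5
```

Each `-l FN` adds an adapter at scale 1.0 and each `-s FN S` adds one at the given scale, matching the argument parsing in `export-lora.cpp` below.
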
examples/export-lora/export-lora.cpp (new file, 474 lines)

#include "common.h"
#include "ggml.h"
#include "ggml-alloc.h"

#include <vector>
#include <string>
#include <thread>

static const size_t tensor_alignment = 32;

struct lora_info {
    std::string filename;
    float scale;
};

struct export_lora_params {
    std::string fn_model_base;
    std::string fn_model_out;
    std::vector<struct lora_info> lora;
    int n_threads;
};

struct lora_data {
    struct lora_info info;
    std::vector<uint8_t> data;
    struct ggml_context * ctx;

    uint32_t lora_r;
    uint32_t lora_alpha;
};

struct llama_file {
    // use FILE * so we don't have to re-open the file to mmap
    FILE * fp;
    size_t size;

    llama_file(const char * fname, const char * mode) {
        fp = std::fopen(fname, mode);
        if (fp == NULL) {
            size = 0;
        } else {
            seek(0, SEEK_END);
            size = tell();
            seek(0, SEEK_SET);
        }
    }

    size_t tell() const {
#ifdef _WIN32
        __int64 ret = _ftelli64(fp);
#else
        long ret = std::ftell(fp);
#endif
        GGML_ASSERT(ret != -1); // this really shouldn't fail
        return (size_t) ret;
    }

    void seek(size_t offset, int whence) {
#ifdef _WIN32
        int ret = _fseeki64(fp, (__int64) offset, whence);
#else
        int ret = std::fseek(fp, (long) offset, whence);
#endif
        GGML_ASSERT(ret == 0); // same
    }

    void read_raw(void * ptr, size_t size) {
        if (size == 0) {
            return;
        }
        errno = 0;
        std::size_t ret = std::fread(ptr, size, 1, fp);
        if (ferror(fp)) {
            die_fmt("read error: %s", strerror(errno));
        }
        if (ret != 1) {
            die("unexpectedly reached end of file");
        }
    }

    std::uint32_t read_u32() {
        std::uint32_t ret;
        read_raw(&ret, sizeof(ret));
        return ret;
    }

    std::string read_string(std::uint32_t len) {
        std::vector<char> chars(len);
        read_raw(chars.data(), len);
        return std::string(chars.data(), len);
    }

    void write_raw(const void * ptr, size_t size) {
        if (size == 0) {
            return;
        }
        errno = 0;
        size_t ret = std::fwrite(ptr, size, 1, fp);
        if (ret != 1) {
            die_fmt("write error: %s", strerror(errno));
        }
    }

    void write_u32(std::uint32_t val) {
        write_raw(&val, sizeof(val));
    }

    bool eof() {
        return tell() >= size;
    }

    ~llama_file() {
        if (fp) {
            std::fclose(fp);
        }
    }
};

static struct export_lora_params get_default_export_lora_params() {
    struct export_lora_params result;
    result.fn_model_base = "";
    result.fn_model_out  = "";
    result.n_threads = GGML_DEFAULT_N_THREADS;
    return result;
}

static void export_lora_print_usage(int /*argc*/, char ** argv, const struct export_lora_params * params) {
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h, --help                         show this help message and exit\n");
    fprintf(stderr, "  -m FNAME, --model-base FNAME       model path from which to load base model (default '%s')\n", params->fn_model_base.c_str());
    fprintf(stderr, "  -o FNAME, --model-out FNAME        path to save exported model (default '%s')\n", params->fn_model_out.c_str());
    fprintf(stderr, "  -l FNAME, --lora FNAME             apply LoRA adapter\n");
    fprintf(stderr, "  -s FNAME S, --lora-scaled FNAME S  apply LoRA adapter with user defined scaling S\n");
    fprintf(stderr, "  -t N, --threads N                  number of threads to use during computation (default: %d)\n", params->n_threads);
}

static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_params * params) {
    bool invalid_param = false;
    std::string arg;
    struct export_lora_params default_params = get_default_export_lora_params();
    const std::string arg_prefix = "--";

    for (int i = 1; i < argc; i++) {
        arg = argv[i];
        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
            std::replace(arg.begin(), arg.end(), '_', '-');
        }

        if (arg == "-m" || arg == "--model-base") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params->fn_model_base = argv[i];
        } else if (arg == "-o" || arg == "--model-out") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params->fn_model_out = argv[i];
        } else if (arg == "-l" || arg == "--lora") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            struct lora_info lora;
            lora.filename = argv[i];
            lora.scale = 1.0f;
            params->lora.push_back(lora);
        } else if (arg == "-s" || arg == "--lora-scaled") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            struct lora_info lora;
            lora.filename = argv[i];
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            lora.scale = std::stof(argv[i]);
            params->lora.push_back(lora);
        } else if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params->n_threads = std::stoi(argv[i]);
            if (params->n_threads <= 0) {
                params->n_threads = std::thread::hardware_concurrency();
            }
        } else {
            fprintf(stderr, "error: unknown argument: '%s'\n", arg.c_str());
            export_lora_print_usage(argc, argv, &default_params);
            exit(1);
        }
    }

    if (params->fn_model_base == default_params.fn_model_base) {
        fprintf(stderr, "error: please specify a filename for model-base.\n");
        export_lora_print_usage(argc, argv, &default_params);
        exit(1);
    }
    if (params->fn_model_out == default_params.fn_model_out) {
        fprintf(stderr, "error: please specify a filename for model-out.\n");
        export_lora_print_usage(argc, argv, &default_params);
        exit(1);
    }
    if (invalid_param) {
        fprintf(stderr, "error: invalid parameter for argument: '%s'\n", arg.c_str());
        export_lora_print_usage(argc, argv, &default_params);
        exit(1);
    }
    return true;
}

static void free_lora(struct lora_data * lora) {
    if (lora->ctx != NULL) {
        ggml_free(lora->ctx);
    }
    delete lora;
}

static struct lora_data * load_lora(struct lora_info * info) {
    struct lora_data * result = new struct lora_data;
    result->info = *info;
    result->ctx = NULL;
    result->lora_r = 1;
    result->lora_alpha = 1;

    struct llama_file file(info->filename.c_str(), "rb");
    if (file.fp == NULL) {
        fprintf(stderr, "warning: Could not open lora adapter '%s'. Ignoring this adapter.\n",
            info->filename.c_str());
        free_lora(result);
        return NULL;
    }

    struct ggml_init_params params_ggml;
    params_ggml.mem_size   = ggml_tensor_overhead() * GGML_MAX_NODES;
    params_ggml.mem_buffer = NULL;
    params_ggml.no_alloc   = true;
    result->ctx = ggml_init(params_ggml);

    uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla'
    uint32_t magic = file.read_u32();
    if (magic != LLAMA_FILE_MAGIC_LORA) {
        die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str());
    }
    uint32_t version = file.read_u32();
    if (version != 1) {
        die_fmt("unexpected lora file version '%u' in '%s'", (unsigned) version, info->filename.c_str());
    }
    result->lora_r     = file.read_u32();
    result->lora_alpha = file.read_u32();
    // read tensor infos from file
    std::vector<char> name_buf;
    std::vector<struct ggml_tensor *> tensors;
    std::vector<size_t> tensors_offset;
    size_t total_nbytes_pad = 0;
    while(!file.eof()) {
        int64_t ne[4]   = {1,1,1,1};
        uint32_t n_dims  = file.read_u32();
        uint32_t namelen = file.read_u32();
        uint32_t type    = file.read_u32();
        for (uint32_t k = 0; k < n_dims; ++k) {
            ne[k] = (int64_t)file.read_u32();
        }
        name_buf.clear();
        name_buf.resize(namelen + 1, '\0');
        file.read_raw(name_buf.data(), namelen);
        file.seek((0-file.tell()) & 31, SEEK_CUR);
        size_t offset = file.tell();
        struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne);
        ggml_set_name(tensor, name_buf.data());
        size_t nbytes     = ggml_nbytes(tensor);
        size_t nbytes_pad = ggml_nbytes_pad(tensor);
        total_nbytes_pad += nbytes_pad;
        tensors.push_back(tensor);
        tensors_offset.push_back(offset);
        file.seek(nbytes, SEEK_CUR);
    }
    // read tensor data
    result->data.resize(total_nbytes_pad);
    size_t data_offset = 0;
    for (size_t i = 0; i < tensors.size(); ++i) {
        struct ggml_tensor * tensor = tensors[i];
        size_t offset     = tensors_offset[i];
        size_t nbytes     = ggml_nbytes(tensor);
        size_t nbytes_pad = ggml_nbytes_pad(tensor);
        file.seek(offset, SEEK_SET);
        tensor->data = result->data.data() + data_offset;
        file.read_raw(tensor->data, nbytes);
        data_offset += nbytes_pad;
    }
    return result;
}

static struct ggml_cgraph * build_graph_lora(
    struct ggml_context * ctx,
    struct ggml_tensor * tensor,
    struct ggml_tensor * lora_a,
    struct ggml_tensor * lora_b,
    float scaling
) {
    struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
    if (scaling != 1.0f) {
        ab = ggml_scale(ctx, ab, ggml_new_f32(ctx, scaling));
    }
    struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand (gf, res);
    return gf;
}

static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int n_threads) {
    if (lora->ctx == NULL) {
        return false;
    }
    std::string name = ggml_get_name(tensor);
    std::string name_a = name + std::string(".loraA");
    std::string name_b = name + std::string(".loraB");
    struct ggml_tensor * lora_a = ggml_get_tensor(lora->ctx, name_a.c_str());
    struct ggml_tensor * lora_b = ggml_get_tensor(lora->ctx, name_b.c_str());
    if (lora_a == NULL || lora_b == NULL) {
        return false;
    }

    float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r;

    struct ggml_init_params params;
    params.mem_size   = GGML_OBJECT_SIZE + GGML_GRAPH_SIZE + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
    params.mem_buffer = NULL;
    params.no_alloc   = true;
    struct ggml_context * ctx = NULL;
    struct ggml_allocr * alloc = NULL;
    struct ggml_cgraph * gf = NULL;

    ctx   = ggml_init(params);
    alloc = ggml_allocr_new_measure(tensor_alignment);
    gf    = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
    size_t alloc_size = ggml_allocr_alloc_graph(alloc, gf);
    ggml_allocr_free(alloc);
    ggml_free(ctx);

    static std::vector<uint8_t> data_compute;
    data_compute.resize(alloc_size + tensor_alignment);

    ctx   = ggml_init(params);
    alloc = ggml_allocr_new(data_compute.data(), data_compute.size(), tensor_alignment);
    gf    = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
    ggml_allocr_alloc_graph(alloc, gf);
    ggml_allocr_free(alloc);

    struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);
    static std::vector<uint8_t> data_work;
    data_work.resize(cplan.work_size);
    cplan.work_data = data_work.data();

    ggml_graph_compute(gf, &cplan);

    ggml_free(ctx);
    return true;
}

static void export_lora(struct export_lora_params * params) {
    // load all loras
    std::vector<struct lora_data *> loras;
    for (size_t i = 0; i < params->lora.size(); ++i) {
        struct lora_data * lora = load_lora(&params->lora[i]);
        if (lora != NULL) {
            loras.push_back(lora);
        }
    }
    if (loras.size() == 0) {
        fprintf(stderr, "warning: no lora adapters will be applied.\n");
    }

    // open input file
    struct llama_file fin(params->fn_model_base.c_str(), "rb");
    if (!fin.fp) {
        die_fmt("Could not open file '%s'\n", params->fn_model_base.c_str());
    }

    // open base model gguf, read tensors without their data
    struct ggml_context * ctx_in;
    struct gguf_init_params params_gguf;
    params_gguf.no_alloc = true;
    params_gguf.ctx      = &ctx_in;
    struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf);

    // create new gguf
    struct gguf_context * gguf_out = gguf_init_empty();

    // copy meta data from base model: kv and tensors
    gguf_set_kv(gguf_out, gguf_in);
    int n_tensors = gguf_get_n_tensors(gguf_in);
    for (int i=0; i < n_tensors; ++i) {
        const char * name = gguf_get_tensor_name(gguf_in, i);
        struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
        gguf_add_tensor(gguf_out, tensor);
    }

    // create output file
    struct llama_file fout(params->fn_model_out.c_str(), "wb");
    if (!fout.fp) {
        die_fmt("Could not create file '%s'\n", params->fn_model_out.c_str());
    }

    // write gguf meta data
    std::vector<uint8_t> meta;
    meta.resize(gguf_get_meta_size(gguf_out));
    gguf_get_meta_data(gguf_out, meta.data());
    fout.write_raw(meta.data(), meta.size());

    std::vector<uint8_t> data;
    std::vector<uint8_t> padding;
    for (int i=0; i < n_tensors; ++i) {
        const char * name = gguf_get_tensor_name(gguf_in, i);
        struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);

        // read tensor data
        data.resize(ggml_nbytes(tensor));
        tensor->data = data.data();
        size_t offset = gguf_get_tensor_offset(gguf_in, i);
        fin.seek(offset + meta.size(), SEEK_SET);
        fin.read_raw(data.data(), data.size());

        // apply all loras
        for (size_t k = 0; k < loras.size(); ++k) {
            apply_lora(tensor, loras[k], params->n_threads);
        }

        // write tensor data + padding
        padding.clear();
        padding.resize(GGML_PAD(data.size(), gguf_get_alignment(gguf_out)) - data.size(), 0);

        GGML_ASSERT(fout.tell() == offset + meta.size());
        // fout.seek(offset + meta.size(), SEEK_SET);
        fout.write_raw(data.data(), data.size());
        fout.write_raw(padding.data(), padding.size());

        if (i % 2 == 0) {
            printf(".");
        }
    }
    printf("\n");

    // close gguf
    gguf_free(gguf_out);
    gguf_free(gguf_in);

    // free loras
    for (size_t i = 0; i < loras.size(); ++i) {
        free_lora(loras[i]);
    }
}

int main(int argc, char ** argv) {
    struct export_lora_params params = get_default_export_lora_params();

    if (!export_lora_params_parse(argc, argv, &params)) {
        return 1;
    }

    export_lora(&params);

    return 0;
}

examples/finetune/CMakeLists.txt (new file, 5 lines)

set(TARGET finetune)
add_executable(${TARGET} finetune.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

90
examples/finetune/README.md
Normal file
@@ -0,0 +1,90 @@
# finetune

Basic usage instructions:

```bash
# get training data
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt

# finetune LORA adapter
./bin/finetune \
        --model-base open-llama-3b-v2-q8_0.gguf \
        --checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
        --checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
        --lora-out lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \
        --train-data "shakespeare.txt" \
        --save-every 10 \
        --threads 6 --adam-iter 30 --batch 4 --ctx 64 \
        --use-checkpointing

# predict
./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
```

Finetune output files will be saved every N iterations (set with `--save-every N`).
The pattern 'ITERATION' in the output filenames will be replaced with the iteration number, and with 'LATEST' for the latest output.
So in the above example, after 10 iterations these files will be written:
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
- lora-open-llama-3b-v2-q8_0-shakespeare-10.bin
- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin

After 10 more iterations:
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-20.gguf
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
- lora-open-llama-3b-v2-q8_0-shakespeare-20.bin
- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin

Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, finetuning starts from a new, randomly initialized adapter.
llama.cpp compatible LORA adapters will be saved with the filename specified by `--lora-out FN`.
These LORA adapters can then be used by `main` together with the base model, like in the 'predict' example command above.

In `main` you can also load multiple LORA adapters, which will then be mixed together.

For example, if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:

```bash
./bin/main -m open-llama-3b-v2-q8_0.gguf \
  --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
  --lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
```

You can change how strongly each LORA adapter is applied to the base model by using `--lora-scaled FN SCALE` instead of `--lora FN`.

For example, to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:

```bash
./bin/main -m open-llama-3b-v2-q8_0.gguf \
  --lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
  --lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
  --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
```

The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values.
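
As a rough sketch of what the scale does (assuming the standard LoRA formulation; it matches `scaling = scale * lora_alpha / lora_r` in `llama.cpp`): each adapter modifies a base weight matrix as `W' = W + scale * (lora_alpha / lora_r) * (B * A)`, so `--lora-scaled FN 0.4` simply multiplies that adapter's update by 0.4 before it is added to the base weights.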

Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime, because activations are recomputed during the backward pass instead of being kept in memory.
If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.

The default LORA rank can be specified with `--lora-r N`.
The LORA rank can be configured for each model tensor type separately with these command line options:

```bash
  --lora-r N           LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default 4)
  --rank-att-norm N    LORA rank for attention norm tensor (default 1)
  --rank-ffn-norm N    LORA rank for feed-forward norm tensor (default 1)
  --rank-out-norm N    LORA rank for output norm tensor (default 1)
  --rank-tok-embd N    LORA rank for token embeddings tensor (default 4)
  --rank-out N         LORA rank for output tensor (default 4)
  --rank-wq N          LORA rank for wq tensor (default 4)
  --rank-wk N          LORA rank for wk tensor (default 4)
  --rank-wv N          LORA rank for wv tensor (default 4)
  --rank-wo N          LORA rank for wo tensor (default 4)
  --rank-w1 N          LORA rank for w1 tensor (default 4)
  --rank-w2 N          LORA rank for w2 tensor (default 4)
  --rank-w3 N          LORA rank for w3 tensor (default 4)
```

The LORA rank of 'norm' tensors should always be 1.

To see all available options use `finetune --help`.
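
Checkpoints written by older, pre-GGUF versions of this example can be converted to the new GGUF checkpoint format with the accompanying `convert-finetune-checkpoint-to-gguf.py` script. A sketch of an invocation (the filenames below are only placeholders for an old-format checkpoint):

```bash
python3 examples/finetune/convert-finetune-checkpoint-to-gguf.py \
  --input  chk-lora-shakespeare-LATEST.bin \
  --output chk-lora-shakespeare-LATEST.gguf
```

If this fails with a tensor size mismatch, pass the model's feed-forward size explicitly with `--ff N`.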
489
examples/finetune/convert-finetune-checkpoint-to-gguf.py
Normal file
@@ -0,0 +1,489 @@
||||||
|
#!/usr/bin/env python3
|
||||||
|
# finetune checkpoint --> gguf conversion
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import gguf
|
||||||
|
import os
|
||||||
|
import struct
|
||||||
|
import sys
|
||||||
|
import numpy as np
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# gguf constants
|
||||||
|
LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
|
||||||
|
LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"
|
||||||
|
LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
|
||||||
|
LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"
|
||||||
|
LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"
|
||||||
|
LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"
|
||||||
|
LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"
|
||||||
|
LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"
|
||||||
|
LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"
|
||||||
|
LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"
|
||||||
|
LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"
|
||||||
|
LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
|
||||||
|
LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"
|
||||||
|
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"
|
||||||
|
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"
|
||||||
|
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"
|
||||||
|
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"
|
||||||
|
LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
|
||||||
|
|
||||||
|
LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"
|
||||||
|
LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"
|
||||||
|
LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
|
||||||
|
|
||||||
|
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"
|
||||||
|
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
|
||||||
|
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"
|
||||||
|
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"
|
||||||
|
LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"
|
||||||
|
LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"
|
||||||
|
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"
|
||||||
|
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
|
||||||
|
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
|
||||||
|
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
|
||||||
|
|
||||||
|
LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"
|
||||||
|
LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
|
||||||
|
LLM_KV_TRAINING_TYPE = "training.type"
|
||||||
|
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
|
||||||
|
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
|
||||||
|
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
|
||||||
|
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
|
||||||
|
|
||||||
|
LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd"
|
||||||
|
LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"
|
||||||
|
LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output"
|
||||||
|
LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm"
|
||||||
|
LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q"
|
||||||
|
LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k"
|
||||||
|
LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v"
|
||||||
|
LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output"
|
||||||
|
LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm"
|
||||||
|
LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate"
|
||||||
|
LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down"
|
||||||
|
LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up"
|
||||||
|
|
||||||
|
class Tensor:
|
||||||
|
def __init__(self, dtype='f', ne=None):
|
||||||
|
if ne is None:
|
||||||
|
ne = []
|
||||||
|
self.dtype = dtype
|
||||||
|
self.ne = ne
|
||||||
|
self.nbytes = 0
|
||||||
|
if self.dtype == 'f':
|
||||||
|
if len(self.ne) == 0:
|
||||||
|
self.nbytes = 0
|
||||||
|
else:
|
||||||
|
self.nbytes = int(np.product(self.ne)) * 4
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unhandled data type '{self.dtype}'")
|
||||||
|
|
||||||
|
def load(self, data, offset):
|
||||||
|
nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
|
||||||
|
assert(nd == len(self.ne))
|
||||||
|
ne = []
|
||||||
|
for d in range(nd):
|
||||||
|
n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
ne.append(n)
|
||||||
|
|
||||||
|
if tuple(ne) != tuple(self.ne):
|
||||||
|
raise ValueError(f"Tensor.load: Expected number of elements {str(self.ne)} does not match what is read from file {str(ne)}")
|
||||||
|
|
||||||
|
if self.dtype == 'f':
|
||||||
|
assert(dtype == 0)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unhandled data type '{self.dtype}'")
|
||||||
|
|
||||||
|
self.name = bytes(data[offset:offset+namelen]); offset += namelen
|
||||||
|
# 32-byte alignment
|
||||||
|
offset += (0 - offset) & 31
|
||||||
|
self.data = data[offset:offset+self.nbytes]
|
||||||
|
offset += self.nbytes
|
||||||
|
return offset
|
||||||
|
|
||||||
|
def max_storage_size(self):
|
||||||
|
result = 0
|
||||||
|
result += 4 # nd
|
||||||
|
result += 4 # namelen
|
||||||
|
result += 4 # dtype
|
||||||
|
result += len(self.ne)*8 # ne
|
||||||
|
result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
|
||||||
|
result += 31 # 32-byte alignment
|
||||||
|
result += self.nbytes
|
||||||
|
return result
|
||||||
|
|
||||||
|
def save_gguf(self, gguf_writer, name):
|
||||||
|
gguf_writer.add_tensor(
|
||||||
|
name=name,
|
||||||
|
tensor=self.data,
|
||||||
|
raw_shape=np.array(list(reversed(self.ne))),
|
||||||
|
raw_dtype=gguf.GGMLQuantizationType.F32)
|
||||||
|
|
||||||
|
class OptimizationContext:
|
||||||
|
def __init__(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def load(self, data, offset):
|
||||||
|
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
|
||||||
|
offset += 4
|
||||||
|
|
||||||
|
if self.version != 1:
|
||||||
|
raise ValueError('Invalid version of optimization context in checkpoint file')
|
||||||
|
|
||||||
|
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
|
||||||
|
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
|
||||||
|
|
||||||
|
self.adam_m = Tensor('f', [self.nx])
|
||||||
|
self.adam_v = Tensor('f', [self.nx])
|
||||||
|
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||||
|
|
||||||
|
self.lbfgs_x = Tensor('f', [self.nx])
|
||||||
|
self.lbfgs_xp = Tensor('f', [self.nx])
|
||||||
|
self.lbfgs_g = Tensor('f', [self.nx])
|
||||||
|
self.lbfgs_gp = Tensor('f', [self.nx])
|
||||||
|
self.lbfgs_d = Tensor('f', [self.nx])
|
||||||
|
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||||
|
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
|
||||||
|
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
|
||||||
|
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
|
||||||
|
self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
|
||||||
|
|
||||||
|
# forgot to save type in version 1:
|
||||||
|
# guess self.type from number of remaining bytes
|
||||||
|
size_type_0 = 12 + sum([t.max_storage_size() for t in
|
||||||
|
[self.adam_m, self.adam_v]
|
||||||
|
+([self.adam_pf] if (self.past > 0) else [])])
|
||||||
|
size_type_1 = 24 + sum([t.max_storage_size() for t in
|
||||||
|
[self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
|
||||||
|
self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
|
||||||
|
self.lbfgs_lmal, self.lbfgs_lmys,
|
||||||
|
self.lbfgs_lms, self.lbfgs_lmy]
|
||||||
|
+([self.lbfgs_pf] if (self.past > 0) else [])])
|
||||||
|
# due to alignment padding the size might not by exact
|
||||||
|
# but the difference in size for both types is significant,
|
||||||
|
# so we can just use whichever is closest
|
||||||
|
remaining = len(data) - offset
|
||||||
|
if abs(remaining - size_type_0) < abs(remaining - size_type_1):
|
||||||
|
self.type = 0
|
||||||
|
else:
|
||||||
|
self.type = 1
|
||||||
|
|
||||||
|
if self.type == 0:
|
||||||
|
offset = self.adam_m.load(data, offset)
|
||||||
|
offset = self.adam_v.load(data, offset)
|
||||||
|
offset = self.adam_pf.load(data,offset)
|
||||||
|
|
||||||
|
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
|
||||||
|
elif self.type == 1:
|
||||||
|
offset = self.lbfgs_x.load(data, offset)
|
||||||
|
offset = self.lbfgs_xp.load(data, offset)
|
||||||
|
offset = self.lbfgs_g.load(data, offset)
|
||||||
|
offset = self.lbfgs_gp.load(data, offset)
|
||||||
|
offset = self.lbfgs_d.load(data, offset)
|
||||||
|
offset = self.lbfgs_pf.load(data, offset)
|
||||||
|
offset = self.lbfgs_lmal.load(data, offset)
|
||||||
|
offset = self.lbfgs_lmys.load(data, offset)
|
||||||
|
offset = self.lbfgs_lms.load(data, offset)
|
||||||
|
offset = self.lbfgs_lmy.load(data, offset)
|
||||||
|
|
||||||
|
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Invalid optimizer type '{self.type}'")
|
||||||
|
|
||||||
|
return offset
|
||||||
|
|
||||||
|
def save_gguf(self, gguf_writer):
|
||||||
|
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
|
||||||
|
gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
|
||||||
|
gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
|
||||||
|
|
||||||
|
if self.type == 0:
|
||||||
|
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
|
||||||
|
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
|
||||||
|
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
|
||||||
|
|
||||||
|
self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
|
||||||
|
self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
|
||||||
|
if self.past > 0:
|
||||||
|
self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
|
||||||
|
|
||||||
|
elif self.type == 1:
|
||||||
|
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
|
||||||
|
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
|
||||||
|
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
|
||||||
|
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
|
||||||
|
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
|
||||||
|
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
|
||||||
|
|
||||||
|
self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
|
||||||
|
self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
|
||||||
|
self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
|
||||||
|
self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
|
||||||
|
self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
|
||||||
|
if self.past > 0:
|
||||||
|
self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
|
||||||
|
self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
|
||||||
|
self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
|
||||||
|
self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
|
||||||
|
self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
|
||||||
|
else:
|
||||||
|
raise ValueError('Unknown optimizer type')
|
||||||
|
|
||||||
|
class LoraParams:
|
||||||
|
def __init__(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def load(self, data, offset):
|
||||||
|
self.n_rank_attention_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_rank_wq = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_rank_wk = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_rank_wv = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_rank_wo = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_rank_ffn_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_rank_w1 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_rank_w2 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_rank_w3 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_rank_tok_embeddings = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_rank_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_rank_output = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
return offset
|
||||||
|
|
||||||
|
def save_gguf(self, gguf_writer):
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD, self.n_rank_tok_embeddings)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, self.n_rank_norm)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT, self.n_rank_output)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_NORM, self.n_rank_attention_norm)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_Q, self.n_rank_wq)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_K, self.n_rank_wk)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_V, self.n_rank_wv)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, self.n_rank_wo)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_NORM, self.n_rank_ffn_norm)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_GATE, self.n_rank_w1)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, self.n_rank_w2)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_UP, self.n_rank_w3)
|
||||||
|
|
||||||
|
class ModelParams:
|
||||||
|
def __init__(self, n_ff = None):
|
||||||
|
self.n_ff = n_ff
|
||||||
|
|
||||||
|
def load(self, data, offset):
|
||||||
|
self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_embd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_mult = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_head = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_rot = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
return offset
|
||||||
|
|
||||||
|
def get_n_ff(self):
|
||||||
|
if self.n_ff is None:
|
||||||
|
# struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
|
||||||
|
return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
|
||||||
|
else:
|
||||||
|
return self.n_ff
|
||||||
|
|
||||||
|
def save_gguf(self, gguf_writer):
|
||||||
|
# self.n_vocab not saved
|
||||||
|
gguf_writer.add_embedding_length(self.n_embd)
|
||||||
|
gguf_writer.add_head_count(self.n_head)
|
||||||
|
gguf_writer.add_block_count(self.n_layer)
|
||||||
|
gguf_writer.add_rope_dimension_count(self.n_rot)
|
||||||
|
gguf_writer.add_feed_forward_length(self.get_n_ff())
|
||||||
|
|
||||||
|
def tensor_name(key, bid=None, suffix=".weight"):
|
||||||
|
return gguf.MODEL_TENSOR_NAMES[gguf.MODEL_ARCH.LLAMA][key].format(bid=bid) + suffix
|
||||||
|
|
||||||
|
class Layer:
|
||||||
|
def __init__(self, params, lora_params, bid):
|
||||||
|
self.bid = bid
|
||||||
|
self.att_norm_a = Tensor('f', [lora_params.n_rank_attention_norm, params.n_embd])
|
||||||
|
self.att_norm_b = Tensor('f', [lora_params.n_rank_attention_norm, 1])
|
||||||
|
self.wq_a = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
|
||||||
|
self.wq_b = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
|
||||||
|
self.wk_a = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
|
||||||
|
self.wk_b = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
|
||||||
|
self.wv_a = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
|
||||||
|
self.wv_b = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
|
||||||
|
self.wo_a = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
|
||||||
|
self.wo_b = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
|
||||||
|
self.ffn_norm_a = Tensor('f', [lora_params.n_rank_ffn_norm, params.n_embd])
|
||||||
|
self.ffn_norm_b = Tensor('f', [lora_params.n_rank_ffn_norm, 1])
|
||||||
|
self.w1_a = Tensor('f', [lora_params.n_rank_w1, params.n_embd])
|
||||||
|
self.w1_b = Tensor('f', [lora_params.n_rank_w1, params.get_n_ff()])
|
||||||
|
self.w2_a = Tensor('f', [lora_params.n_rank_w2, params.get_n_ff()])
|
||||||
|
self.w2_b = Tensor('f', [lora_params.n_rank_w2, params.n_embd])
|
||||||
|
self.w3_a = Tensor('f', [lora_params.n_rank_w3, params.n_embd])
|
||||||
|
self.w3_b = Tensor('f', [lora_params.n_rank_w3, params.get_n_ff()])
|
||||||
|
|
||||||
|
def load(self, data, offset):
|
||||||
|
offset = self.att_norm_a.load(data, offset)
|
||||||
|
offset = self.att_norm_b.load(data, offset)
|
||||||
|
offset = self.wq_a.load(data, offset)
|
||||||
|
offset = self.wq_b.load(data, offset)
|
||||||
|
offset = self.wk_a.load(data, offset)
|
||||||
|
offset = self.wk_b.load(data, offset)
|
||||||
|
offset = self.wv_a.load(data, offset)
|
||||||
|
offset = self.wv_b.load(data, offset)
|
||||||
|
offset = self.wo_a.load(data, offset)
|
||||||
|
offset = self.wo_b.load(data, offset)
|
||||||
|
offset = self.ffn_norm_a.load(data, offset)
|
||||||
|
offset = self.ffn_norm_b.load(data, offset)
|
||||||
|
offset = self.w1_a.load(data, offset)
|
||||||
|
offset = self.w1_b.load(data, offset)
|
||||||
|
offset = self.w2_a.load(data, offset)
|
||||||
|
offset = self.w2_b.load(data, offset)
|
||||||
|
offset = self.w3_a.load(data, offset)
|
||||||
|
offset = self.w3_b.load(data, offset)
|
||||||
|
return offset
|
||||||
|
|
||||||
|
def save_gguf(self, gguf_writer):
|
||||||
|
self.att_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_a"))
|
||||||
|
self.att_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_b"))
|
||||||
|
self.wq_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid, ".weight.lora_a"))
|
||||||
|
self.wq_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid, ".weight.lora_b"))
|
||||||
|
self.wk_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid, ".weight.lora_a"))
|
||||||
|
self.wk_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid, ".weight.lora_b"))
|
||||||
|
self.wv_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid, ".weight.lora_a"))
|
||||||
|
self.wv_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid, ".weight.lora_b"))
|
||||||
|
self.wo_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid, ".weight.lora_a"))
|
||||||
|
self.wo_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid, ".weight.lora_b"))
|
||||||
|
self.ffn_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid, ".weight.lora_a"))
|
||||||
|
self.ffn_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid, ".weight.lora_b"))
|
||||||
|
self.w1_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid, ".weight.lora_a"))
|
||||||
|
self.w1_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid, ".weight.lora_b"))
|
||||||
|
self.w2_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid, ".weight.lora_a"))
|
||||||
|
self.w2_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid, ".weight.lora_b"))
|
||||||
|
self.w3_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid, ".weight.lora_a"))
|
||||||
|
self.w3_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid, ".weight.lora_b"))
|
||||||
|
|
||||||
|
class LoraModel:
|
||||||
|
def __init__(self, n_ff = None):
|
||||||
|
self.params = ModelParams(n_ff = n_ff)
|
||||||
|
self.lora_params = LoraParams()
|
||||||
|
self.layers = []
|
||||||
|
|
||||||
|
def load(self, data, offset):
|
||||||
|
offset = self.params.load(data, offset)
|
||||||
|
offset = self.lora_params.load(data, offset)
|
||||||
|
|
||||||
|
self.tok_embd_a = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_embd])
|
||||||
|
self.tok_embd_b = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_vocab])
|
||||||
|
self.norm_a = Tensor('f', [self.lora_params.n_rank_norm, self.params.n_embd])
|
||||||
|
self.norm_b = Tensor('f', [self.lora_params.n_rank_norm, 1])
|
||||||
|
self.output_a = Tensor('f', [self.lora_params.n_rank_output, self.params.n_embd])
|
||||||
|
self.output_b = Tensor('f', [self.lora_params.n_rank_output, self.params.n_vocab])
|
||||||
|
|
||||||
|
offset = self.tok_embd_a.load(data, offset)
|
||||||
|
offset = self.tok_embd_b.load(data, offset)
|
||||||
|
offset = self.norm_a.load(data, offset)
|
||||||
|
offset = self.norm_b.load(data, offset)
|
||||||
|
offset = self.output_a.load(data, offset)
|
||||||
|
offset = self.output_b.load(data, offset)
|
||||||
|
|
||||||
|
self.layers.clear()
|
||||||
|
for bid in range(self.params.n_layer):
|
||||||
|
layer = Layer(self.params, self.lora_params, bid)
|
||||||
|
offset = layer.load(data, offset)
|
||||||
|
self.layers.append(layer)
|
||||||
|
|
||||||
|
return offset
|
||||||
|
|
||||||
|
def save_gguf(self, gguf_writer):
|
||||||
|
self.params.save_gguf(gguf_writer)
|
||||||
|
self.lora_params.save_gguf(gguf_writer)
|
||||||
|
|
||||||
|
self.tok_embd_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD, suffix=".weight.lora_a"))
|
||||||
|
self.tok_embd_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD, suffix=".weight.lora_b"))
|
||||||
|
self.norm_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_a"))
|
||||||
|
self.norm_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_b"))
|
||||||
|
self.output_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT, suffix=".weight.lora_a"))
|
||||||
|
self.output_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT, suffix=".weight.lora_b"))
|
||||||
|
|
||||||
|
for layer in self.layers:
|
||||||
|
layer.save_gguf(gguf_writer)
|
||||||
|
|
||||||
|
class LoraCheckpoint:
|
||||||
|
def __init__(self, n_ff = None):
|
||||||
|
self.model = LoraModel(n_ff = n_ff)
|
||||||
|
self.opt_ctx = OptimizationContext()
|
||||||
|
|
||||||
|
def load(self, data, offset):
|
||||||
|
magic = bytes(reversed(data[offset:offset + 4])); offset += 4
|
||||||
|
if magic != b'ggcl':
|
||||||
|
raise ValueError(f"File header magic indicates, that this is no finetune-lora checkpoint file. Expected 'ggcl', Got '{str(magic)}'")
|
||||||
|
|
||||||
|
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
if self.version != 0:
|
||||||
|
raise ValueError('Invalid version of checkpoint file')
|
||||||
|
|
||||||
|
self.train_its = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.train_tokens = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
|
||||||
|
offset = self.model.load(data, offset)
|
||||||
|
offset = self.opt_ctx.load(data, offset)
|
||||||
|
|
||||||
|
return offset
|
||||||
|
|
||||||
|
def save_gguf(self, gguf_writer):
|
||||||
|
gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
|
||||||
|
gguf_writer.add_layer_norm_rms_eps(1e-5)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
|
||||||
|
gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
|
||||||
|
self.model.save_gguf(gguf_writer)
|
||||||
|
self.opt_ctx.save_gguf(gguf_writer)
|
||||||
|
|
||||||
|
def handle_args():
|
||||||
|
parser = argparse.ArgumentParser(description = 'Convert finetune checkpoints to GGUF')
|
||||||
|
parser.add_argument('--input', '-i', type = Path, help = 'Input finetune checkpoint filename', required=True)
|
||||||
|
parser.add_argument('--output', '-o', type = Path, help = 'Output GGUF filename', required=True)
|
||||||
|
parser.add_argument('--ff', type = int, help = "Feedforward size, if not provided compute from n_mult. Provide this if you get 'ValueError: Tensor.load: Expected number of elements does not match what is read from file'", required=False)
|
||||||
|
return parser.parse_args()
|
||||||
|
|
||||||
|
def main():
|
||||||
|
cfg = handle_args()
|
||||||
|
print(cfg)
|
||||||
|
data = np.memmap(cfg.input, mode = 'r')
|
||||||
|
chk = LoraCheckpoint(n_ff = cfg.ff)
|
||||||
|
offset = 0
|
||||||
|
offset = chk.load(data, offset)
|
||||||
|
# we should have read all available data
|
||||||
|
assert(offset == len(data))
|
||||||
|
|
||||||
|
gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
|
||||||
|
chk.save_gguf(gguf_writer)
|
||||||
|
print(" gguf: write header")
|
||||||
|
gguf_writer.write_header_to_file()
|
||||||
|
print(" gguf: write metadata")
|
||||||
|
gguf_writer.write_kv_data_to_file()
|
||||||
|
print(" gguf: write tensors")
|
||||||
|
gguf_writer.write_tensors_to_file()
|
||||||
|
gguf_writer.close()
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
1937
examples/finetune/finetune.cpp
Normal file
File diff suppressed because it is too large
|
@ -947,7 +947,23 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.lora_adapter = argv[i];
|
params.lora_adapter.push_back({argv[i], 1.0f});
|
||||||
|
params.use_mmap = false;
|
||||||
|
}
|
||||||
|
else if (arg == "--lora-scaled")
|
||||||
|
{
|
||||||
|
if (++i >= argc)
|
||||||
|
{
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
const char * lora_adapter = argv[i];
|
||||||
|
if (++i >= argc)
|
||||||
|
{
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])});
|
||||||
params.use_mmap = false;
|
params.use_mmap = false;
|
||||||
}
|
}
|
||||||
else if (arg == "--lora-base")
|
else if (arg == "--lora-base")
|
||||||
|
|
|
@ -10,9 +10,9 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
|
||||||
./bin/train-text-from-scratch \
|
./bin/train-text-from-scratch \
|
||||||
--vocab-model ../models/ggml-vocab-llama.gguf \
|
--vocab-model ../models/ggml-vocab-llama.gguf \
|
||||||
--ctx 64 --embd 256 --head 8 --layer 16 \
|
--ctx 64 --embd 256 --head 8 --layer 16 \
|
||||||
--checkpoint-in chk-shakespeare-256x16.gguf \
|
--checkpoint-in chk-shakespeare-256x16-LATEST.gguf \
|
||||||
--checkpoint-out chk-shakespeare-256x16.gguf \
|
--checkpoint-out chk-shakespeare-256x16-ITERATION.gguf \
|
||||||
--model-out ggml-shakespeare-256x16-f32.gguf \
|
--model-out ggml-shakespeare-256x16-f32-ITERATION.gguf \
|
||||||
--train-data "shakespeare.txt" \
|
--train-data "shakespeare.txt" \
|
||||||
-t 6 -b 16 --seed 1 --adam-iter 256 \
|
-t 6 -b 16 --seed 1 --adam-iter 256 \
|
||||||
--no-checkpointing
|
--no-checkpointing
|
||||||
|
@ -20,3 +20,8 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
|
||||||
# predict
|
# predict
|
||||||
./bin/main -m ggml-shakespeare-256x16-f32.gguf
|
./bin/main -m ggml-shakespeare-256x16-f32.gguf
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Output files will be saved every N iterations (config with `--save-every N`).
|
||||||
|
The pattern "ITERATION" in the output filenames will be replaced with the iteration number and "LATEST" for the latest output.
|
||||||
|
|
||||||
|
To train GGUF models just pass them to `--checkpoint-in FN`.
|
||||||
|
|
|
@ -47,10 +47,13 @@ LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
|
||||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
|
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
|
||||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
|
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
|
||||||
|
|
||||||
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
|
LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"
|
||||||
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
|
LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
|
||||||
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
|
LLM_KV_TRAINING_TYPE = "training.type"
|
||||||
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
|
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
|
||||||
|
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
|
||||||
|
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
|
||||||
|
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
|
||||||
|
|
||||||
class Tensor:
|
class Tensor:
|
||||||
def __init__(self, dtype='f', ne=None):
|
def __init__(self, dtype='f', ne=None):
|
||||||
|
@ -460,6 +463,7 @@ class Checkpoint:
|
||||||
gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
|
gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
|
||||||
gguf_writer.add_layer_norm_rms_eps(1e-5)
|
gguf_writer.add_layer_norm_rms_eps(1e-5)
|
||||||
gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
|
gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
|
||||||
|
gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_TRAIN_MODEL)
|
||||||
gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
|
gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
|
||||||
gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
|
gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
|
||||||
gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
|
gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
|
||||||
|
|
File diff suppressed because it is too large
10
ggml-alloc.c
10
ggml-alloc.c
|
@ -77,7 +77,7 @@ struct free_block {
|
||||||
size_t size;
|
size_t size;
|
||||||
};
|
};
|
||||||
|
|
||||||
#define MAX_FREE_BLOCKS 128
|
#define MAX_FREE_BLOCKS 256
|
||||||
|
|
||||||
struct ggml_allocr {
|
struct ggml_allocr {
|
||||||
void * data;
|
void * data;
|
||||||
|
@ -187,6 +187,7 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
|
||||||
}
|
}
|
||||||
|
|
||||||
tensor->data = addr;
|
tensor->data = addr;
|
||||||
|
AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
|
||||||
|
|
||||||
#ifdef GGML_ALLOCATOR_DEBUG
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
add_allocated_tensor(alloc, tensor);
|
add_allocated_tensor(alloc, tensor);
|
||||||
|
@ -218,7 +219,8 @@ static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tens
|
||||||
|
|
||||||
size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
|
size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
|
||||||
size = aligned_offset(NULL, size, alloc->alignment);
|
size = aligned_offset(NULL, size, alloc->alignment);
|
||||||
AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
|
AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
|
||||||
|
AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size);
|
||||||
|
|
||||||
#ifdef GGML_ALLOCATOR_DEBUG
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
remove_allocated_tensor(alloc, tensor);
|
remove_allocated_tensor(alloc, tensor);
|
||||||
|
@ -631,3 +633,7 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
|
||||||
size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
|
size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
|
||||||
return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
|
return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
|
||||||
|
return alloc->max_size;
|
||||||
|
}
|
||||||
|
|
|
@ -19,6 +19,7 @@ GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
|
||||||
GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
|
GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
|
||||||
GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
|
GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
|
||||||
GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
|
GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
|
||||||
|
GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|
83
ggml.h
83
ggml.h
|
@ -214,8 +214,8 @@
|
||||||
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
||||||
|
|
||||||
#define GGML_MAX_DIMS 4
|
#define GGML_MAX_DIMS 4
|
||||||
#define GGML_MAX_NODES 4096
|
#define GGML_MAX_NODES 16384
|
||||||
#define GGML_MAX_PARAMS 256
|
#define GGML_MAX_PARAMS 1024
|
||||||
#define GGML_MAX_CONTEXTS 64
|
#define GGML_MAX_CONTEXTS 64
|
||||||
#define GGML_MAX_SRC 6
|
#define GGML_MAX_SRC 6
|
||||||
#define GGML_MAX_NAME 64
|
#define GGML_MAX_NAME 64
|
||||||
|
@ -526,7 +526,15 @@ extern "C" {
|
||||||
// next prime after GGML_MAX_NODES
|
// next prime after GGML_MAX_NODES
|
||||||
// #define GGML_GRAPH_HASHTABLE_SIZE 4099
|
// #define GGML_GRAPH_HASHTABLE_SIZE 4099
|
||||||
// next prime after GGML_MAX_NODES * 2 (nodes + leafs)
|
// next prime after GGML_MAX_NODES * 2 (nodes + leafs)
|
||||||
#define GGML_GRAPH_HASHTABLE_SIZE 8273
|
// #define GGML_GRAPH_HASHTABLE_SIZE 8273
|
||||||
|
// #define GGML_GRAPH_HASHTABLE_SIZE 16411
|
||||||
|
#define GGML_GRAPH_HASHTABLE_SIZE 32771
|
||||||
|
|
||||||
|
enum ggml_cgraph_eval_order {
|
||||||
|
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
|
||||||
|
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
|
||||||
|
GGML_CGRAPH_EVAL_ORDER_COUNT
|
||||||
|
};
|
||||||
|
|
||||||
// computation graph
|
// computation graph
|
||||||
struct ggml_cgraph {
|
struct ggml_cgraph {
|
||||||
|
@ -539,6 +547,8 @@ extern "C" {
|
||||||
|
|
||||||
void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
|
void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
|
||||||
|
|
||||||
|
enum ggml_cgraph_eval_order order;
|
||||||
|
|
||||||
// performance
|
// performance
|
||||||
int perf_runs;
|
int perf_runs;
|
||||||
int64_t perf_cycles;
|
int64_t perf_cycles;
|
||||||
|
@ -686,12 +696,21 @@ extern "C" {
|
||||||
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
||||||
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
||||||
|
|
||||||
|
// Converts a flat index into coordinates
|
||||||
|
GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
|
||||||
|
|
||||||
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
||||||
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
||||||
|
|
||||||
|
GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||||
|
GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
|
||||||
|
|
||||||
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
||||||
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
||||||
|
|
||||||
|
GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||||
|
GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
|
||||||
|
|
||||||
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
||||||
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
||||||
|
|
||||||
|
@ -725,6 +744,12 @@ extern "C" {
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b);
|
struct ggml_tensor * b);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_add_cast(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * b,
|
||||||
|
enum ggml_type type);
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_add1(
|
GGML_API struct ggml_tensor * ggml_add1(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
|
@ -834,6 +859,7 @@ extern "C" {
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b);
|
struct ggml_tensor * b);
|
||||||
|
|
||||||
|
// sums repetitions in a into shape of b
|
||||||
GGML_API struct ggml_tensor * ggml_repeat_back(
|
GGML_API struct ggml_tensor * ggml_repeat_back(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
|
@ -1689,6 +1715,16 @@ extern "C" {
|
||||||
// dump the graph into a file using the dot format
|
// dump the graph into a file using the dot format
|
||||||
GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
|
GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
|
||||||
|
|
||||||
|
// build gradient checkpointing backward graph gb for gf using provided checkpoints
|
||||||
|
// gb_tmp will contain original backward graph with rewritten backward process nodes,
|
||||||
|
// but without the second forward pass nodes.
|
||||||
|
GGML_API void ggml_build_backward_gradient_checkpointing(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_cgraph * gf,
|
||||||
|
struct ggml_cgraph * gb,
|
||||||
|
struct ggml_cgraph * gb_tmp,
|
||||||
|
struct ggml_tensor * * checkpoints,
|
||||||
|
int n_checkpoints);
|
||||||
//
|
//
|
||||||
// optimization
|
// optimization
|
||||||
//
|
//
|
||||||
|
@ -1723,7 +1759,7 @@ extern "C" {
|
||||||
GGML_LINESEARCH_INVALID_PARAMETERS,
|
GGML_LINESEARCH_INVALID_PARAMETERS,
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef void (*ggml_opt_callback)(void * data, float * sched);
|
typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
|
||||||
typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
|
typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
|
||||||
|
|
||||||
// optimization parameters
|
// optimization parameters
|
||||||
|
@ -1755,6 +1791,8 @@ extern "C" {
|
||||||
bool print_forward_graph;
|
bool print_forward_graph;
|
||||||
bool print_backward_graph;
|
bool print_backward_graph;
|
||||||
|
|
||||||
|
int n_gradient_accumulation;
|
||||||
|
|
||||||
// ADAM parameters
|
// ADAM parameters
|
||||||
struct {
|
struct {
|
||||||
int n_iter;
|
int n_iter;
|
||||||
|
@ -1800,6 +1838,7 @@ extern "C" {
|
||||||
float loss_after;
|
float loss_after;
|
||||||
|
|
||||||
struct {
|
struct {
|
||||||
|
struct ggml_tensor * g; // current gradient
|
||||||
struct ggml_tensor * m; // first moment
|
struct ggml_tensor * m; // first moment
|
||||||
struct ggml_tensor * v; // second moment
|
struct ggml_tensor * v; // second moment
|
||||||
struct ggml_tensor * pf; // past function values
|
struct ggml_tensor * pf; // past function values
|
||||||
|
@ -1916,26 +1955,26 @@ extern "C" {
|
||||||
|
|
||||||
GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
|
GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
|
||||||
GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
|
GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
|
||||||
GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int i);
|
GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
|
||||||
|
|
||||||
GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i);
|
GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i);
|
GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
|
||||||
|
|
||||||
// results are undefined if the wrong type is used for the key
|
// will abort if the wrong type is used for the key
|
||||||
GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int i);
|
GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int i);
|
GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int i);
|
GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int i);
|
GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int i);
|
GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int i);
|
GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int i);
|
GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int i);
|
GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int i);
|
GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int i);
|
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int i);
|
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i);
|
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int i);
|
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i);
|
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
|
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
|
||||||
|
|
||||||
GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
|
GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
|
||||||
|
|
39
llama.cpp
39
llama.cpp
|
@ -1936,20 +1936,18 @@ static void llm_load_vocab(
|
||||||
throw std::runtime_error("cannot find tokenizer vocab in model file\n");
|
throw std::runtime_error("cannot find tokenizer vocab in model file\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const float * scores = nullptr;
|
||||||
const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
|
const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
|
||||||
if (score_idx == -1) {
|
if (score_idx != -1) {
|
||||||
throw std::runtime_error("cannot find tokenizer scores in model file\n");
|
scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
|
const int * toktypes = nullptr;
|
||||||
|
|
||||||
const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
|
const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
|
||||||
if (toktype_idx == -1) {
|
if (toktype_idx != -1) {
|
||||||
throw std::runtime_error("cannot find token type list in GGUF file\n");
|
toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
|
|
||||||
|
|
||||||
// determine vocab type
|
// determine vocab type
|
||||||
{
|
{
|
||||||
std::string tokenizer_name;
|
std::string tokenizer_name;
|
||||||
|
@ -2017,8 +2015,8 @@ static void llm_load_vocab(
|
||||||
|
|
||||||
auto & token_data = vocab.id_to_token[i];
|
auto & token_data = vocab.id_to_token[i];
|
||||||
token_data.text = std::move(word);
|
token_data.text = std::move(word);
|
||||||
token_data.score = scores[i];
|
token_data.score = scores ? scores[i] : 0.0f;
|
||||||
token_data.type = (llama_token_type) toktypes[i];
|
token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
|
||||||
}
|
}
|
||||||
|
|
||||||
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
||||||
|
@@ -6265,7 +6263,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 }
 
 static int llama_apply_lora_from_file_internal(
-    const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads
+    const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
 ) {
     LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 

@@ -6294,7 +6292,7 @@ static int llama_apply_lora_from_file_internal(
     int32_t lora_alpha;
     fin.read((char *) &lora_r, sizeof(lora_r));
     fin.read((char *) &lora_alpha, sizeof(lora_alpha));
-    float scaling = (float)lora_alpha / (float)lora_r;
+    float scaling = scale * (float)lora_alpha / (float)lora_r;
 
     LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
 

@@ -6510,9 +6508,10 @@ static int llama_apply_lora_from_file_internal(
             ggml_set_name(r, "r_cpy");
         }
 
-        struct ggml_cgraph gf = ggml_build_forward(r);
+        struct ggml_cgraph * gf = ggml_new_graph(lora_ctx);
+        ggml_build_forward_expand(gf, r);
 
-        ggml_graph_compute_helper(work_buffer, &gf, n_threads);
+        ggml_graph_compute_helper(work_buffer, gf, n_threads);
 
         // we won't need these tensors again, reset the context to save memory
         ggml_free(lora_ctx);
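A quick numeric check of the new factor, with made-up values (passing scale = 1.0f reproduces the old behaviour of alpha / r):

    const float   scale      = 0.5f;   // illustrative value
    const int32_t lora_r     = 16;     // illustrative value
    const int32_t lora_alpha = 32;     // illustrative value
    const float   scaling    = scale * (float)lora_alpha / (float)lora_r;   // 0.5 * 32 / 16 = 1.0f
    // the adapter delta is applied exactly as before, only multiplied by the
    // user-supplied scale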
@@ -6901,6 +6900,10 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
     return nparams;
 }
 
+struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
+    return ggml_get_tensor(model->ctx, name);
+}
+
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,

@@ -6914,18 +6917,18 @@ int llama_model_quantize(
     }
 }
 
-int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
     try {
-        return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
     }
 }
 
-int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
     try {
-        return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
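A minimal usage sketch for the new accessor; the model pointer is assumed to come from the usual loading API, and the tensor name shown ("token_embd.weight") is the conventional GGUF name for the token embeddings, used here only as an example:

    // look up a tensor by its GGUF name from an already-loaded model;
    // ggml_get_tensor returns NULL when the name is not found
    struct ggml_tensor * tok_embd = llama_get_model_tensor(model, "token_embd.weight");
    if (tok_embd == NULL) {
        fprintf(stderr, "tensor 'token_embd.weight' not found\n");
    }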
llama.h
@@ -291,6 +291,9 @@ extern "C" {
     // Returns the total number of parameters in the model
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
+    // Get a llama model tensor
+    LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
+
     // Returns 0 on success
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,

@@ -306,15 +309,17 @@ extern "C" {
     LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
             struct llama_context * ctx,
             const char * path_lora,
+            float scale,
             const char * path_base_model,
             int n_threads),
             "use llama_model_apply_lora_from_file instead");
 
     LLAMA_API int llama_model_apply_lora_from_file(
             const struct llama_model * model,
             const char * path_lora,
+            float scale,
             const char * path_base_model,
             int n_threads);
 
     //
     // KV cache
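A hedged usage sketch of the updated public entry point declared above; the path, scale and thread count are placeholders:

    // apply a LoRA adapter at half strength; passing NULL for path_base_model
    // patches the tensors of the model that is already loaded
    const int rc = llama_model_apply_lora_from_file(model, "lora-adapter.bin", 0.5f, NULL, 4);
    if (rc != 0) {
        fprintf(stderr, "failed to apply LoRA adapter\n");
    }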
tests/test-grad0.cpp
@@ -251,18 +251,20 @@ static bool check_gradient(
         printf("GGML_N_THREADS = %d\n", n_threads);
     }
 
-    struct ggml_cgraph gf = ggml_build_forward (f);
-    struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
+    struct ggml_cgraph * gf = ggml_build_forward_ctx(ctx0, f);
+    struct ggml_cgraph * gb = ggml_new_graph(ctx0);
+    *gb = *gf;
+    ggml_build_backward_expand(ctx0, gf, gb, false);
 
-    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
 
-    ggml_graph_reset (&gf);
+    ggml_graph_reset (gf);
     ggml_set_f32 (f->grad, 1.0f);
 
-    ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
+    ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
 
-    // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot");
-    // ggml_graph_dump_dot(&gb, &gf, "test-grad0-backward.dot");
+    // ggml_graph_dump_dot(gf, NULL, "test-grad0-forward.dot");
+    // ggml_graph_dump_dot(gb, gf, "test-grad0-backward.dot");
 
     for (int i = 0; i < nargs; ++i) {
         const int nelements = ggml_nelements(x[i]);
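For reference, the pointer-based graph API that the test now uses looks roughly like this in isolation; the context, loss tensor and thread count below are assumptions, and the sequence simply mirrors the change above:

    // assumed setup: a ggml_context * ctx with enough memory, inputs marked
    // with ggml_set_param, and a scalar tensor `loss` built from them
    struct ggml_cgraph * gf = ggml_build_forward_ctx(ctx, loss);   // forward graph
    struct ggml_cgraph * gb = ggml_new_graph(ctx);                 // storage for the backward graph
    *gb = *gf;                                                     // start from the forward nodes
    ggml_build_backward_expand(ctx, gf, gb, false);                // append the gradient nodes

    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);       // forward pass
    ggml_graph_reset(gf);                                          // zero the gradients
    ggml_set_f32(loss->grad, 1.0f);                                // seed d(loss)/d(loss) = 1
    ggml_graph_compute_with_ctx(ctx, gb, /*n_threads =*/ 1);       // backward pass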
@@ -273,13 +275,13 @@ static bool check_gradient(
             const float xp = x0 + eps;
             ggml_set_f32_1d(x[i], k, xp);
 
-            ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
 
             const double f0 = ggml_get_f32_1d(f, 0);
 
             ggml_set_f32_1d(x[i], k, xm);
 
-            ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
 
             const double f1 = ggml_get_f32_1d(f, 0);
             const double g0 = (f0 - f1)/(2.0*(double) eps);

@@ -287,10 +289,10 @@ static bool check_gradient(
             ggml_set_f32_1d(x[i], k, x0);
 
             // compute gradient using backward graph
-            ggml_graph_reset (&gf);
+            ggml_graph_reset (gf);
             ggml_set_f32 (f->grad, 1.0f);
 
-            ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
+            ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
 
             const double g1 = ggml_get_f32_1d(x[i]->grad, k);
 
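In the two hunks above, check_gradient estimates each partial derivative numerically with the central difference g0 = (f(x0 + eps) - f(x0 - eps)) / (2*eps), computed from two forward evaluations of gf, and compares it against the analytic value g1 read from x[i]->grad after running the backward graph gb.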
@@ -373,7 +375,7 @@ static bool check_mat_mul(
 
 int main(int argc, const char ** argv) {
     struct ggml_init_params params = {
-        /* .mem_size   = */ 128*1024*1024,
+        /* .mem_size   = */ 256*1024*1024,
         /* .mem_buffer = */ NULL,
         /* .no_alloc   = */ false,
     };
@@ -405,6 +407,7 @@ int main(int argc, const char ** argv) {
         }
     }
 
+    unsigned seed_iter = 1;
 
     // original loop: 1000
    int niter = 4;

@@ -416,6 +419,10 @@ int main(int argc, const char ** argv) {
        niter = atoi(argv[1]);
    }
    for (int iter = 0; iter < niter; ++iter) {
+        srand(seed_iter);
+        seed_iter = rand();
+        unsigned seed = rand();
+
        printf("test-grad0: iter:%d/%d\n", iter, niter);
        struct ggml_context * ctx0 = ggml_init(params);
 
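With seed_iter carried across iterations, each pass of the loop reseeds the C RNG deterministically and derives a fresh per-iteration seed; every op-test block below then calls srand(seed) before building its tensors, so for a given iteration each op draws the same random data no matter which tests ran before it, which appears intended to make individual failures reproducible in isolation.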
@@ -425,6 +432,7 @@ int main(int argc, const char ** argv) {
 
         // add f32
         {
+            srand(seed);
             const int nargs = 2;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -441,6 +449,7 @@ int main(int argc, const char ** argv) {
 
         // add f16
         {
+            srand(seed);
             const int nargs = 2;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -457,6 +466,7 @@ int main(int argc, const char ** argv) {
 
         // sub
         {
+            srand(seed);
             const int nargs = 2;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -473,6 +483,7 @@ int main(int argc, const char ** argv) {
 
         // mul
         {
+            srand(seed);
             const int nargs = 2;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -489,6 +500,7 @@ int main(int argc, const char ** argv) {
 
         // div
        {
+            srand(seed);
             const int nargs = 2;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -505,6 +517,7 @@ int main(int argc, const char ** argv) {
 
         // sqr
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 2; ++ndims) {

@@ -521,6 +534,7 @@ int main(int argc, const char ** argv) {
 
         // sqrt
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 2; ++ndims) {

@@ -537,6 +551,7 @@ int main(int argc, const char ** argv) {
 
         // log
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 2; ++ndims) {

@@ -553,6 +568,7 @@ int main(int argc, const char ** argv) {
 
         // sum
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 2; ++ndims) {

@@ -570,6 +586,7 @@ int main(int argc, const char ** argv) {
 
         // sum_rows
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -587,6 +604,7 @@ int main(int argc, const char ** argv) {
         // mean, not yet fully implemented
         if(0)
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -604,6 +622,7 @@ int main(int argc, const char ** argv) {
         // argmax
         if (0)
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -620,6 +639,7 @@ int main(int argc, const char ** argv) {
 
         // repeat
         {
+            srand(seed);
             int64_t ne2[4];
             get_random_dims(ne2, 4);
 

@@ -642,6 +662,7 @@ int main(int argc, const char ** argv) {
 
         // repeat back
         {
+            srand(seed);
             int64_t ne2[4];
             get_random_dims(ne2, 4);
 

@@ -680,6 +701,7 @@ int main(int argc, const char ** argv) {
 
         // sgn
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -696,6 +718,7 @@ int main(int argc, const char ** argv) {
 
         // neg
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -712,6 +735,7 @@ int main(int argc, const char ** argv) {
 
         // step
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -729,6 +753,7 @@ int main(int argc, const char ** argv) {
         // tanh, not yet fully implemented
         if(0)
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {
@@ -745,33 +770,45 @@ int main(int argc, const char ** argv) {
 
         // mul_mat
         {
+            srand(seed);
             const int nargs = 2;
 
-            for (int ndims = 2; ndims <= 2; ++ndims) {
+            for (int ndims = 2; ndims <= 4; ++ndims) {
+                int max_nrep = (ndims >= 3) ? 2 : 1;
                 x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                {
-                    int64_t ne2[4];
-                    get_random_dims(ne2, 4);
-                    ne2[0] = ne[0];
-                    x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-                }
-
-                ggml_set_param(ctx0, x[0]);
-                ggml_set_param(ctx0, x[1]);
-
-                struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]);
-                struct ggml_tensor * f = ggml_sum(ctx0, m);
-
-                GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims);
-
-                check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-                check_mat_mul(m, x[1], x[0]);
+                for (int nrep2 = 1; nrep2 < max_nrep; ++nrep2) {
+                    for (int nrep3 = 1; nrep3 < max_nrep; ++nrep3) {
+                        {
+                            int64_t ne2[4];
+                            get_random_dims(ne2, 4);
+                            ne2[0] = ne[0];
+                            ne2[2] = nrep2 * ne[2];
+                            ne2[3] = nrep3 * ne[3];
+                            x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
+                        }
+
+                        ggml_set_param(ctx0, x[0]);
+                        ggml_set_param(ctx0, x[1]);
+
+                        struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]);
+                        struct ggml_tensor * f = ggml_sum(ctx0, m);
+
+                        GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims);
+
+                        check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                        if (ndims == 2) {
+                            // check_mat_mul does not support ndims > 2
+                            check_mat_mul(m, x[1], x[0]);
+                        }
+                    }
+                }
             }
         }
 
         // elu, not yet fully implemented
         if(0)
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {
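The mul_mat gradient check now also covers 3-D and 4-D operands: dims 2 and 3 of x[1] are scaled by nrep2 and nrep3 relative to x[0], so broadcasted (batched) matrix multiplication is exercised as well, while check_mat_mul itself is still only run in the 2-D case it supports.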
@@ -788,6 +825,7 @@ int main(int argc, const char ** argv) {
 
         // relu
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -805,6 +843,7 @@ int main(int argc, const char ** argv) {
         // gelu, not yet fully implemented
         if(0)
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -821,6 +860,7 @@ int main(int argc, const char ** argv) {
 
         // silu
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 2; ++ndims) {

@@ -842,6 +882,7 @@ int main(int argc, const char ** argv) {
 
         // rms_norm
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 2; ++ndims) {

@@ -858,6 +899,7 @@ int main(int argc, const char ** argv) {
 
         // scale
         {
+            srand(seed);
             const int nargs = 2;
 
             int64_t ne2[4];

@@ -878,6 +920,7 @@ int main(int argc, const char ** argv) {
 
         // cpy f32
         {
+            srand(seed);
             const int nargs = 2;
 
             for (int ndims = 1; ndims <= 2; ++ndims) {

@@ -895,6 +938,7 @@ int main(int argc, const char ** argv) {
 
         // cpy f16
         {
+            srand(seed);
             const int nargs = 2;
 
             for (int ndims = 1; ndims <= 2; ++ndims) {

@@ -912,6 +956,7 @@ int main(int argc, const char ** argv) {
 
         // reshape (1d->nd)
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 2; ++ndims) {

@@ -935,6 +980,7 @@ int main(int argc, const char ** argv) {
 
         // reshape (nd->1d)
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 2; ++ndims) {

@@ -958,6 +1004,7 @@ int main(int argc, const char ** argv) {
 
         // acc 1d
         {
+            srand(seed);
             int64_t ne2[4] = { 1, 1, 1, 1 };
 
             const int nargs = 2;

@@ -985,6 +1032,7 @@ int main(int argc, const char ** argv) {
 
         // acc 2d
         {
+            srand(seed);
             int64_t ne2[4] = { 1, 1, 1, 1 };
             int64_t max_offsets[4] = { 0, 0, 0, 0 };
             int64_t offsets[4] = { 0, 0, 0, 0 };

@@ -1017,6 +1065,7 @@ int main(int argc, const char ** argv) {
 
         // acc 3d
         {
+            srand(seed);
             int64_t ne2[4] = { 1, 1, 1, 1 };
             int64_t max_offsets[4] = { 0, 0, 0, 0 };
             int64_t offsets[4] = { 0, 0, 0, 0 };

@@ -1051,6 +1100,7 @@ int main(int argc, const char ** argv) {
 
         // acc 4d
         {
+            srand(seed);
             int64_t ne2[4] = { 1, 1, 1, 1 };
             int64_t max_offsets[4] = { 0, 0, 0, 0 };
             int64_t offsets[4] = { 0, 0, 0, 0 };

@@ -1087,6 +1137,7 @@ int main(int argc, const char ** argv) {
 
         // set_1d
         {
+            srand(seed);
             int64_t ne2[4];
 
             const int nargs = 2;

@@ -1114,6 +1165,7 @@ int main(int argc, const char ** argv) {
 
         // set_2d
         {
+            srand(seed);
             int64_t ne2[4];
             int64_t max_offsets[4] = { 0, 0, 0, 0 };
             int64_t offsets[4] = { 0, 0, 0, 0 };

@@ -1146,6 +1198,7 @@ int main(int argc, const char ** argv) {
 
         // view_1d
         {
+            srand(seed);
             const int nargs = 1;
             for (int ndims = 1; ndims <= 4; ++ndims) {
 

@@ -1169,6 +1222,7 @@ int main(int argc, const char ** argv) {
 
         // view_2d
         {
+            srand(seed);
             int64_t ne2[4];
             int64_t nb2[4];
 

@@ -1199,6 +1253,7 @@ int main(int argc, const char ** argv) {
 
         // view_3d
         {
+            srand(seed);
             int64_t ne2[4] = {1,1,1,1};
             int64_t nb2[4] = {0,0,0,0};
 

@@ -1230,6 +1285,7 @@ int main(int argc, const char ** argv) {
 
         // permute
         {
+            srand(seed);
             int64_t ne2[4];
 
             const int nargs = 1;

@@ -1263,6 +1319,7 @@ int main(int argc, const char ** argv) {
 
         // transpose
         {
+            srand(seed);
             int64_t ne2[4];
 
             const int nargs = 1;

@@ -1290,6 +1347,7 @@ int main(int argc, const char ** argv) {
 
         // get_rows
         {
+            srand(seed);
             int64_t ne2[4] = {ne[0], ne[1], 1, 1};
             int64_t ne3[4] = {1+irand(ne[1]), 1, 1, 1};
             const int nargs = 1;

@@ -1306,6 +1364,7 @@ int main(int argc, const char ** argv) {
 
         // diag_mask_inf
         {
+            srand(seed);
             const int nargs = 1;
             const int ndims = 2;
 

@@ -1321,6 +1380,7 @@ int main(int argc, const char ** argv) {
 
         // diag_mask_zero
         {
+            srand(seed);
             const int nargs = 1;
             const int ndims = 2;
 

@@ -1336,6 +1396,7 @@ int main(int argc, const char ** argv) {
 
         // softmax
         {
+            srand(seed);
             const int nargs = 1;
 
             int64_t ne2[4];
@@ -1357,11 +1418,16 @@ int main(int argc, const char ** argv) {
                                             ggml_new_f32(ctx0, eps))));
 
                 check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY);
+                // NOTE: softmax forward is computed using f16 table lookup instead of using actual expf, but backward assumes actual expf.
+                // this may result in different gradients too finite differences.
+                // when this test reports errors, first try to replace the table lookup with actual expf and test again to see if just that was the cause.
+                // if only the table lookup causes gradients to differ this is acceptable.
             }
         }
 
         // cross_entropy_loss
         {
+            srand(seed);
             const int nargs = 1;
 
             int64_t ne2[4];
@@ -1392,6 +1458,7 @@ int main(int argc, const char ** argv) {
 
         // rope f32
         {
+            srand(seed);
             const int nargs = 1;
 
             int64_t ne2[4];

@@ -1431,6 +1498,7 @@ int main(int argc, const char ** argv) {
 
         // rope f16
         {
+            srand(seed);
             const int nargs = 1;
 
             int64_t ne2[4];
|
||||||
|
|
||||||
// flash_attn f32
|
// flash_attn f32
|
||||||
{
|
{
|
||||||
|
srand(seed);
|
||||||
const int nargs = 3;
|
const int nargs = 3;
|
||||||
|
|
||||||
int64_t ne2[4];
|
int64_t ne2[4];
|
||||||
|
@ -1482,28 +1551,31 @@ int main(int argc, const char ** argv) {
|
||||||
|
|
||||||
for (int masked = 0; masked <= 1; ++masked) {
|
for (int masked = 0; masked <= 1; ++masked) {
|
||||||
for (int ndims = 2; ndims <= 4; ++ndims) {
|
for (int ndims = 2; ndims <= 4; ++ndims) {
|
||||||
int64_t neq[4] = { D, N, B, ne[3] };
|
int max_nrep = (ndims >= 3) ? 2 : 1;
|
||||||
int64_t nek[4] = { D, M, B, ne[3] };
|
for (int nrep = 1; nrep < max_nrep; ++nrep) {
|
||||||
int64_t nev[4] = { M, D, B, ne[3] };
|
int64_t neq[4] = { D, N, B*nrep, ne[3] };
|
||||||
if (ndims == 2) {
|
int64_t nek[4] = { D, M, B, ne[3] };
|
||||||
neq[2] = 1; neq[3] = 1;
|
int64_t nev[4] = { M, D, B, ne[3] };
|
||||||
nek[2] = 1; nek[3] = 1;
|
if (ndims == 2) {
|
||||||
nev[2] = 1; nev[3] = 1;
|
neq[2] = 1; neq[3] = 1;
|
||||||
} else if (ndims == 3) {
|
nek[2] = 1; nek[3] = 1;
|
||||||
neq[3] = 1;
|
nev[2] = 1; nev[3] = 1;
|
||||||
nek[3] = 1;
|
} else if (ndims == 3) {
|
||||||
nev[3] = 1;
|
neq[3] = 1;
|
||||||
|
nek[3] = 1;
|
||||||
|
nev[3] = 1;
|
||||||
|
}
|
||||||
|
x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
|
||||||
|
x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
|
||||||
|
x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
|
||||||
|
ggml_set_param(ctx0, x[0]);
|
||||||
|
ggml_set_param(ctx0, x[1]);
|
||||||
|
ggml_set_param(ctx0, x[2]);
|
||||||
|
|
||||||
|
struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
|
||||||
|
|
||||||
|
check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
|
||||||
}
|
}
|
||||||
x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
|
|
||||||
x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
|
|
||||||
x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
|
|
||||||
ggml_set_param(ctx0, x[0]);
|
|
||||||
ggml_set_param(ctx0, x[1]);
|
|
||||||
ggml_set_param(ctx0, x[2]);
|
|
||||||
|
|
||||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
|
|
||||||
|
|
||||||
check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1511,6 +1583,7 @@ int main(int argc, const char ** argv) {
|
||||||
// flash_attn f16, not yet fully implemented
|
// flash_attn f16, not yet fully implemented
|
||||||
if(0)
|
if(0)
|
||||||
{
|
{
|
||||||
|
srand(seed);
|
||||||
const int nargs = 3;
|
const int nargs = 3;
|
||||||
|
|
||||||
int64_t ne2[4];
|
int64_t ne2[4];
|
||||||
|
|