diff --git a/.devops/llama-cli-intel.Dockerfile b/.devops/llama-cli-intel.Dockerfile
index 2bf82bb58..79dba06a7 100644
--- a/.devops/llama-cli-intel.Dockerfile
+++ b/.devops/llama-cli-intel.Dockerfile
@@ -14,7 +14,9 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
         echo "GGML_SYCL_F16 is set" && \
         export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
     fi && \
-    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+    echo "Building with static libs" && \
+    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
+    ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
     cmake --build build --config Release --target llama-cli
 
 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/llama-server-intel.Dockerfile
index eb9aba618..f525658dd 100644
--- a/.devops/llama-server-intel.Dockerfile
+++ b/.devops/llama-server-intel.Dockerfile
@@ -14,6 +14,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
         echo "GGML_SYCL_F16 is set" && \
         export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
     fi && \
+    echo "Building with dynamic libs" && \
     cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
     cmake --build build --config Release --target llama-server
 
diff --git a/.devops/nix/apps.nix b/.devops/nix/apps.nix
index 897fce4d3..0ecf19fc5 100644
--- a/.devops/nix/apps.nix
+++ b/.devops/nix/apps.nix
@@ -10,7 +10,6 @@
             "llama-embedding"
             "llama-server"
             "llama-quantize"
-            "llama-train-text-from-scratch"
           ];
           mkApp = name: {
             type = "app";
diff --git a/.devops/tools.sh b/.devops/tools.sh
index cf0e8f32d..24dcfd350 100755
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -13,8 +13,6 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
     ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
     ./llama-cli "$@"
-elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
-    ./llama-finetune "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
     echo "Converting PTH to GGML..."
     for i in `ls $1/$2/ggml-model-f16.bin*`; do
@@ -36,8 +34,6 @@ else
     echo "              ex: --outtype f16 \"/models/7B/\" "
     echo "  --quantize (-q): Optimize with quantization process ggml"
     echo "              ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
-    echo "  --finetune (-f): Run finetune command to create a lora finetune of the model"
-    echo "              See documentation for finetune for command-line parameters"
     echo "  --all-in-one (-a): Execute --convert & --quantize"
     echo "              ex: \"/models/\" 7B"
     echo "  --server (-s): Run a model on the server"
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 1f275603a..a1e183d11 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -860,7 +860,7 @@ jobs:
           mkdir build
           cd build
           cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
-          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
+          cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1))
 
       - name: Determine tag name
         id: tag
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 48f9914af..b688f78ec 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,12 +1,17 @@
-# Pull requests
+# Pull requests (for contributors)
 
-- Always squash-merge the PR before merging
-- Use the following format for your final commit: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
 - Test your changes:
   - Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
   - Execute [the full CI locally on your machine](ci/README.md) before publishing
 - Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
-  - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your conveience
+  - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience
+- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
+
+# Pull requests (for collaborators)
+
+- Squash-merge PRs
+- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
+- Optionally, pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
 
 # Coding guidelines
 
diff --git a/Makefile b/Makefile
index 4584594af..8d2ccddc4 100644
--- a/Makefile
+++ b/Makefile
@@ -11,7 +11,6 @@ BUILD_TARGETS = \
 	llama-embedding \
 	llama-eval-callback \
 	llama-export-lora \
-	llama-finetune \
 	llama-gbnf-validator \
 	llama-gguf \
 	llama-gguf-hash \
@@ -37,7 +36,6 @@ BUILD_TARGETS = \
 	llama-simple \
 	llama-speculative \
 	llama-tokenize \
-	llama-train-text-from-scratch \
 	llama-vdot \
 	llama-cvector-generator \
 	tests/test-c.o
@@ -64,13 +62,13 @@ TEST_TARGETS = \
 	tests/test-tokenizer-1-spm
 
 # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
-LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
+LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
 	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
-	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm
+	retrieval speculative infill tokenize benchmark-matmult parallel export-lora lookahead lookup passkey gritlm
 
 # Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
 #  We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
-LEGACY_TARGETS_BUILD = main quantize perplexity embedding server finetune
+LEGACY_TARGETS_BUILD = main quantize perplexity embedding server
 
 # Deprecation aliases
 ifdef LLAMA_CUBLAS
@@ -876,6 +874,9 @@ OBJ_GGML += \
 
 OBJ_LLAMA = \
 	src/llama.o \
+	src/llama-vocab.o \
+	src/llama-grammar.o \
+	src/llama-sampling.o \
 	src/unicode.o \
 	src/unicode-data.o
 
@@ -1055,6 +1056,10 @@ src/unicode-data.o: \
 
 src/llama.o: \
 	src/llama.cpp \
+	src/llama-impl.h \
+	src/llama-vocab.h \
+	src/llama-grammar.h \
+	src/llama-sampling.h \
 	src/unicode.h \
 	include/llama.h \
 	ggml/include/ggml-cuda.h \
@@ -1064,6 +1069,29 @@ src/llama.o: \
 	ggml/include/ggml-backend.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
+src/llama-vocab.o: \
+	src/llama-vocab.cpp \
+	src/llama-vocab.h \
+	src/llama-impl.h \
+	include/llama.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+src/llama-grammar.o: \
+	src/llama-grammar.cpp \
+	src/llama-grammar.h \
+	src/llama-impl.h \
+	src/llama-vocab.h \
+	src/llama-sampling.h \
+	include/llama.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+src/llama-sampling.o: \
+	src/llama-sampling.cpp \
+	src/llama-sampling.h \
+	src/llama-impl.h \
+	include/llama.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 $(LIB_LLAMA): \
 	$(OBJ_LLAMA) \
 	$(LIB_GGML)
@@ -1266,11 +1294,6 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
 	$(OBJ_GGML) $(OBJ_LLAMA)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@@ -1286,13 +1309,8 @@ llama-baby-llama: examples/baby-llama/baby-llama.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-llama-finetune: examples/finetune/finetune.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 llama-export-lora: examples/export-lora/export-lora.cpp \
-	$(OBJ_GGML) common/log.h
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
@@ -1439,7 +1457,7 @@ run-benchmark-matmult: llama-benchmark-matmult
 .PHONY: run-benchmark-matmult swift
 
 tests/test-llama-grammar: tests/test-llama-grammar.cpp \
-	$(OBJ_GGML) $(OBJ_COMMON) src/unicode.o src/unicode-data.o
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
@@ -1548,7 +1566,7 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
 # Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
 #
 # Mark legacy binary targets as .PHONY so that they are always checked.
-.PHONY: main quantize perplexity embedding server finetune
+.PHONY: main quantize perplexity embedding server
 
 # NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
 #  Eventually we will want to remove these target from building all the time.
@@ -1591,13 +1609,3 @@ ifneq (,$(wildcard embedding))
 	@echo "  Remove the 'embedding' binary to remove this warning."
 	@echo "#########"
 endif
-
-finetune: examples/deprecation-warning/deprecation-warning.cpp
-ifneq (,$(wildcard finetune))
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-	@echo "#########"
-	@echo "WARNING: The 'finetune' binary is deprecated. Please use 'llama-finetune' instead."
-	@echo "  Remove the 'finetune' binary to remove this warning."
-	@echo "#########"
-endif
diff --git a/Package.swift b/Package.swift
index d40a48385..1d90b47bf 100644
--- a/Package.swift
+++ b/Package.swift
@@ -4,6 +4,9 @@ import PackageDescription
 
 var sources = [
     "src/llama.cpp",
+    "src/llama-vocab.cpp",
+    "src/llama-grammar.cpp",
+    "src/llama-sampling.cpp",
     "src/unicode.cpp",
     "src/unicode-data.cpp",
     "ggml/src/ggml.c",
diff --git a/README.md b/README.md
index 7c233b5e1..d0ae2efb9 100644
--- a/README.md
+++ b/README.md
@@ -138,6 +138,7 @@ Typically finetunes of the base models below are supported as well.
 
 Unless otherwise noted these projects are open-source with permissive licensing:
 
+- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
 - [iohub/collama](https://github.com/iohub/coLLaMA)
 - [janhq/jan](https://github.com/janhq/jan) (AGPL)
 - [nat/openplayground](https://github.com/nat/openplayground)
@@ -181,6 +182,9 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 
 - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
 
+**Games:**
+- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
+
 ## Demo
 
 <details>
diff --git a/common/common.cpp b/common/common.cpp
index dbb724fbb..ec44a0552 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -694,11 +694,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
         return true;
     }
-    if (arg == "--lora-base") {
-        CHECK_ARG
-        params.lora_base = argv[i];
-        return true;
-    }
     if (arg == "--control-vector") {
         CHECK_ARG
         params.control_vectors.push_back({ 1.0f, argv[i], });
@@ -1274,6 +1269,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         CHECK_ARG
         params.out_file = argv[i];
         params.cvector_outfile = argv[i];
+        params.lora_outfile = argv[i];
         return true;
     }
     if (arg == "-ofreq" || arg == "--output-frequency") {
@@ -1583,9 +1579,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*",           "       --override-kv KEY=TYPE:VALUE",
                                                                         "advanced option to override model metadata by key. may be specified multiple times.\n"
                                                                         "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
-    options.push_back({ "*",           "       --lora FNAME",           "apply LoRA adapter (implies --no-mmap)" });
-    options.push_back({ "*",           "       --lora-scaled FNAME S",  "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
-    options.push_back({ "*",           "       --lora-base FNAME",      "optional model to use as a base for the layers modified by the LoRA adapter" });
+    options.push_back({ "*",           "       --lora FNAME",           "apply LoRA adapter (can be repeated to use multiple adapters)" });
+    options.push_back({ "*",           "       --lora-scaled FNAME S",  "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
     options.push_back({ "*",           "       --control-vector FNAME", "add a control vector\n"
                                                                         "note: this argument can be repeated to add multiple control vectors" });
     options.push_back({ "*",           "       --control-vector-scaled FNAME SCALE",
@@ -1676,6 +1671,13 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "cvector",     "       --pca-iter N",           "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
     options.push_back({ "cvector",     "       --method {pca,mean}",    "dimensionality reduction method to be used (default: pca)" });
 
+    options.push_back({ "export-lora" });
+    options.push_back({ "export-lora", "-m,    --model",                "model path from which to load base model (default '%s')", params.model.c_str() });
+    options.push_back({ "export-lora", "       --lora FNAME",           "path to LoRA adapter  (can be repeated to use multiple adapters)" });
+    options.push_back({ "export-lora", "       --lora-scaled FNAME S",  "path to LoRA adapter with user defined scaling S  (can be repeated to use multiple adapters)" });
+    options.push_back({ "*",           "-t,    --threads N",            "number of threads to use during computation (default: %d)", params.n_threads });
+    options.push_back({ "export-lora", "-o,    --output FNAME",         "output file (default: '%s')", params.lora_outfile.c_str() });
+
     printf("usage: %s [options]\n", argv[0]);
 
     for (const auto & o : options) {
@@ -2721,7 +2723,7 @@ std::string llama_chat_format_single(const struct llama_model * model,
         const llama_chat_msg & new_msg,
         bool add_ass) {
     std::ostringstream ss;
-    auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
+    auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
     std::vector<llama_chat_msg> chat_new(past_msg);
     // if the past_msg ends with a newline, we must preserve it in the formatted version
     if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
@@ -3166,7 +3168,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
         }
         fprintf(stream, "  - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
     }
-    fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
diff --git a/common/common.h b/common/common.h
index 184a53dc0..8240ff99b 100644
--- a/common/common.h
+++ b/common/common.h
@@ -128,7 +128,6 @@ struct gpt_params {
 
     // TODO: avoid tuple, use struct
     std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
-    std::string lora_base  = "";                              // base model path for the lora adapter
 
     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
 
@@ -255,6 +254,8 @@ struct gpt_params {
     std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
 
     bool spm_infill = false; // suffix/prefix/middle pattern for infill
+
+    std::string lora_outfile = "ggml-lora-merged-f16.gguf";
 };
 
 void gpt_params_handle_hf_token(gpt_params & params);
diff --git a/common/sampling.cpp b/common/sampling.cpp
index 6a483c815..079e40516 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -330,7 +330,7 @@ static llama_token llama_sampling_sample_impl(
         llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
 
         // Apply grammar constraints to the single token
-        llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar);
+        llama_grammar_sample(ctx_sampling->grammar, ctx_main, &single_token_data_array);
 
         // Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
         bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
@@ -421,7 +421,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
 
     // apply grammar checks before sampling logic
     if (apply_grammar && ctx_sampling->grammar != NULL) {
-        llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
+        llama_grammar_sample(ctx_sampling->grammar, ctx_main, &cur_p);
     }
 
     return cur_p;
@@ -455,6 +455,6 @@ void llama_sampling_accept(
     ctx_sampling->prev.push_back(id);
 
     if (ctx_sampling->grammar != NULL && apply_grammar) {
-        llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id);
+        llama_grammar_accept_token(ctx_sampling->grammar, ctx_main, id);
     }
 }
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index e3c8aac3d..4087187c1 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -48,7 +48,7 @@ class Model:
 
     dir_model: Path
     ftype: gguf.LlamaFileType
-    fname_out: Path | None
+    fname_out: Path
     is_big_endian: bool
     endianess: gguf.GGUFEndian
     use_temp_file: bool
@@ -62,11 +62,12 @@ class Model:
     gguf_writer: gguf.GGUFWriter
     model_name: str | None
     metadata_override: Path | None
+    dir_model_card: Path
 
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH
 
-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path | None, is_big_endian: bool = False,
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
                  split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
@@ -90,6 +91,7 @@ class Model:
         self.tensor_names = None
         self.metadata_override = metadata_override
         self.model_name = model_name
+        self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py
 
         # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
         if self.ftype == gguf.LlamaFileType.GUESSED:
@@ -237,6 +239,10 @@ class Model:
             self.gguf_writer.add_expert_used_count(n_experts_used)
             logger.info(f"gguf: experts used count = {n_experts_used}")
 
+        if (head_dim := self.hparams.get("head_dim")) is not None:
+            self.gguf_writer.add_key_length(head_dim)
+            self.gguf_writer.add_value_length(head_dim)
+
         self.gguf_writer.add_file_type(self.ftype)
         logger.info(f"gguf: file type = {self.ftype}")
 
@@ -345,7 +351,7 @@ class Model:
 
         total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count()
 
-        self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model, self.model_name, total_params)
+        self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params)
 
         # Fallback to model directory name if metadata name is still missing
         if self.metadata.name is None:
@@ -359,27 +365,22 @@ class Model:
         output_type: str = self.ftype.name.partition("_")[2]
 
         # Filename Output
-        # Note: `not is_dir()` is used because `.is_file()` will not detect
-        #       file template strings as it doesn't actually exist as a file
-        if self.fname_out is not None and not self.fname_out.is_dir():
-            # Output path is a custom defined templated filename
-
-            # Process templated file name with the output ftype, useful with the "auto" ftype
-            self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
-        else:
+        if self.fname_out.is_dir():
             # Generate default filename based on model specification and available metadata
             if not vocab_only:
                 fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None)
             else:
                 fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab")
 
-            # Check if preferred output directory path was provided
-            if self.fname_out is not None and self.fname_out.is_dir():
-                # output path is a directory
-                self.fname_out = self.fname_out / f"{fname_default}.gguf"
-            else:
-                # output in the same directory as the model by default
-                self.fname_out = self.dir_model / f"{fname_default}.gguf"
+            # Use the default filename
+            self.fname_out = self.fname_out / f"{fname_default}.gguf"
+        else:
+            # Output path is a custom defined templated filename
+            # Note: `not is_dir()` is used because `.is_file()` will not detect
+            #       file template strings as it doesn't actually exist as a file
+
+            # Process templated file name with the output ftype, useful with the "auto" ftype
+            self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
 
         self.set_type()
 
@@ -593,6 +594,15 @@ class Model:
         if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
             # ref: https://huggingface.co/core42/jais-13b
             res = "jais"
+        if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
+            # ref: https://huggingface.co/WisdomShell/CodeShell-7B
+            res = "codeshell"
+        if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e":
+            # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407
+            res = "tekken"
+        if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
+            # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
+            res = "smollm"
 
         if res is None:
             logger.warning("\n")
@@ -733,7 +743,7 @@ class Model:
                 added_tokens_json = json.load(f)
                 for key in added_tokens_json:
                     token_id = added_tokens_json[key]
-                    if (token_id >= vocab_size):
+                    if token_id >= vocab_size:
                         logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                         continue
 
@@ -750,7 +760,8 @@ class Model:
                     token_id = int(token_id)
                     token: str = token_data["content"]
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        assert tokens[token_id] == token.encode("utf-8")
+                        if tokens[token_id] != token.encode("utf-8"):
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
                     if token_data.get("special") or self.does_token_look_special(token):
                         toktypes[token_id] = SentencePieceTokenTypes.CONTROL
                     else:
@@ -1309,6 +1320,7 @@ class RefactModel(Model):
         special_vocab._set_special_token("prefix", 1)
         special_vocab._set_special_token("suffix", 3)
         special_vocab._set_special_token("middle", 2)
+        special_vocab.chat_template = None  # do not add it twice
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
@@ -1479,7 +1491,12 @@ class LlamaModel(Model):
         super().set_gguf_parameters()
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
-        self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
 
         if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
             if self.hparams["rope_scaling"].get("type") == "linear":
@@ -1994,7 +2011,7 @@ class Phi3MiniModel(Model):
 
                 for key in added_tokens_json:
                     token_id = added_tokens_json[key]
-                    if (token_id >= vocab_size):
+                    if token_id >= vocab_size:
                         logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                         continue
 
@@ -2011,7 +2028,8 @@ class Phi3MiniModel(Model):
                     token_id = int(token_id)
                     token = foken_data["content"].encode("utf-8")
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        assert tokens[token_id] == token
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2027,7 +2045,8 @@ class Phi3MiniModel(Model):
                     token_id = int(foken_data["id"])
                     token = foken_data["content"].encode("utf-8")
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        assert tokens[token_id] == token
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2065,10 +2084,11 @@ class Phi3MiniModel(Model):
         self.gguf_writer.add_rope_dimension_count(rope_dims)
         self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
         self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))
 
         # write rope scaling for long context (128k) model
         rope_scaling = self.find_hparam(['rope_scaling'], True)
-        if (rope_scaling is None):
+        if rope_scaling is None:
             return
 
         scale = max_pos_embds / orig_max_pos_embds
@@ -2266,7 +2286,8 @@ class InternLM2Model(Model):
                         chat_eos_token_id = token_id
                     token = token.encode("utf-8")
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        assert(tokens[token_id] == token)
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2285,7 +2306,8 @@ class InternLM2Model(Model):
                         chat_eos_token_id = token_id
                     token = token.encode("utf-8")
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        assert(tokens[token_id] == token)
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2471,6 +2493,7 @@ class GemmaModel(Model):
         special_vocab._set_special_token("middle", 68)
         special_vocab._set_special_token("fsep",   70)
         special_vocab._set_special_token("eot",    107)
+        special_vocab.chat_template = None  # do not add it twice
         special_vocab.add_to_gguf(self.gguf_writer)
 
         self.gguf_writer.add_add_space_prefix(False)
@@ -2712,7 +2735,7 @@ class JinaBertV2Model(BertModel):
 
             yield name, data
 
-    def set_vocab(self, *args, **kwargs):
+    def set_vocab(self):
         tokenizer_class = 'BertTokenizer'
         with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
             tokenizer_class = json.load(f)['tokenizer_class']
@@ -2860,7 +2883,7 @@ class ArcticModel(Model):
                     added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
                     for token_id, token_json in added_tokens_decoder.items():
                         token_id = int(token_id)
-                        if (token_id >= vocab_size):
+                        if token_id >= vocab_size:
                             logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                             continue
 
@@ -3109,7 +3132,7 @@ class T5Model(Model):
                 added_tokens_json = json.load(f)
                 for key in added_tokens_json:
                     token_id = added_tokens_json[key]
-                    if (token_id >= vocab_size):
+                    if token_id >= vocab_size:
                         logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                         continue
 
@@ -3624,10 +3647,10 @@ def main() -> None:
         logger.error("Error: Cannot use temp file when splitting")
         sys.exit(1)
 
-    fname_out = None
-
     if args.outfile is not None:
         fname_out = args.outfile
+    else:
+        fname_out = dir_model
 
     logger.info(f"Loading model: {dir_model.name}")
 
@@ -3658,7 +3681,6 @@ def main() -> None:
         else:
             logger.info("Exporting model...")
             model_instance.write()
-            assert model_instance.fname_out is not None
             out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
             logger.info(f"Model successfully exported to {out_path}")
 
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index e4165ae2d..d5a2d925e 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -50,7 +50,7 @@ class TOKENIZER_TYPE(IntEnum):
 
 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 #       will be updated with time - contributions welcome
-chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
 
 if len(sys.argv) == 2:
     token = sys.argv[1]
@@ -91,6 +91,9 @@ models = [
     {"name": "gemma-2",        "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
     {"name": "jais",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
     {"name": "t5",             "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
+    {"name": "codeshell",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
+    {"name": "tekken",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
+    {"name": "smollm",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
 ]
 
 
@@ -99,8 +102,8 @@ def download_file_with_auth(url, token, save_path):
     response = sess.get(url, headers=headers)
     response.raise_for_status()
     os.makedirs(os.path.dirname(save_path), exist_ok=True)
-    with open(save_path, 'wb') as f:
-        f.write(response.content)
+    with open(save_path, 'wb') as downloaded_file:
+        downloaded_file.write(response.content)
     logger.info(f"File {save_path} downloaded successfully")
 
 
@@ -159,7 +162,7 @@ for model in models:
         logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
         continue  # Skip to the next model if the tokenizer can't be loaded
 
-    chktok = tokenizer.encode(chktxt)
+    chktok = tokenizer.encode(CHK_TXT)
     chkhsh = sha256(str(chktok).encode()).hexdigest()
 
     logger.info(f"model: {name}")
@@ -191,7 +194,7 @@ src_func = f"""
         # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
         # use in llama.cpp to implement the same pre-tokenizer
 
-        chktxt = {repr(chktxt)}
+        chktxt = {repr(CHK_TXT)}
 
         chktok = tokenizer.encode(chktxt)
         chkhsh = sha256(str(chktok).encode()).hexdigest()
@@ -287,7 +290,7 @@ tests = [
     "333333333",
     "Cửa Việt", # llama-bpe fails on this
     " discards",
-    chktxt,
+    CHK_TXT,
 ]
 
 # write the tests to ./models/ggml-vocab-{name}.gguf.inp
diff --git a/convert_llama_ggml_to_gguf.py b/convert_llama_ggml_to_gguf.py
index 95ea831a5..7b00b4398 100755
--- a/convert_llama_ggml_to_gguf.py
+++ b/convert_llama_ggml_to_gguf.py
@@ -132,6 +132,10 @@ class Tensor:
 
 
 class GGMLModel:
+
+    file_format: GGMLFormat
+    format_version: int
+
     def __init__(self):
         self.hyperparameters = None
         self.vocab = None
@@ -290,7 +294,7 @@ class GGMLToGGUF:
         if self.vocab_override is not None:
             vo = self.vocab_override
             logger.info('* Adding vocab item(s)')
-            for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
+            for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
                 tokens.append(vbytes)
                 scores.append(score)
                 toktypes.append(ttype)
diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py
index 66e8da37c..a88d0d4a9 100755
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -290,7 +290,7 @@ if __name__ == '__main__':
         fname_out = args.outfile
     else:
         # output in the same directory as the model by default
-        fname_out = dir_lora / 'ggml-lora-{ftype}.gguf'
+        fname_out = dir_lora
 
     if os.path.exists(input_model):
         # lazy import load_file only if lora is in safetensors format.
@@ -304,12 +304,6 @@ if __name__ == '__main__':
     # load base model
     logger.info(f"Loading base model: {dir_base_model.name}")
     hparams = Model.load_hparams(dir_base_model)
-
-    with open(lora_config, "r") as f:
-        lparams: dict[str, Any] = json.load(f)
-
-    alpha: float = lparams["lora_alpha"]
-
     with torch.inference_mode():
         try:
             model_class = Model.from_model_architecture(hparams["architectures"][0])
@@ -320,12 +314,21 @@ if __name__ == '__main__':
         class LoraModel(model_class):
             model_arch = model_class.model_arch
 
+            lora_alpha: float
+
+            def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):
+
+                super().__init__(*args, **kwargs)
+
+                self.dir_model_card = dir_lora_model
+                self.lora_alpha = float(lora_alpha)
+
             def set_type(self):
                 self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
                 self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
 
             def set_gguf_parameters(self):
-                self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, float(alpha))
+                self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
                 super().set_gguf_parameters()
 
             def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
@@ -368,6 +371,11 @@ if __name__ == '__main__':
                     yield (dest_name + ".lora_a", lora_a)
                     yield (dest_name + ".lora_b", lora_b)
 
+        with open(lora_config, "r") as f:
+            lparams: dict[str, Any] = json.load(f)
+
+        alpha: float = lparams["lora_alpha"]
+
         model_instance = LoraModel(
             dir_base_model,
             ftype,
@@ -376,6 +384,8 @@ if __name__ == '__main__':
             use_temp_file=False,
             eager=args.no_lazy,
             dry_run=args.dry_run,
+            dir_lora_model=dir_lora,
+            lora_alpha=alpha,
         )
 
         logger.info("Exporting model...")
diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index 885983e92..d36ac0a15 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -293,31 +293,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
 ```sh
 ./build/bin/llama-ls-sycl-device
 ```
-A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
+This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
 ```
-found 6 SYCL devices:
+found 2 SYCL devices:
+
 |  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
 |ID|       Device Type|                                         Name|capability|units      |group   |group  |Global mem size|
 |--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
 | 0|[level_zero:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       1.3|        512|    1024|     32|    16225243136|
 | 1|[level_zero:gpu:1]|                    Intel(R) UHD Graphics 770|       1.3|         32|     512|     32|    53651849216|
-| 2|    [opencl:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       3.0|        512|    1024|     32|    16225243136|
-| 3|    [opencl:gpu:1]|                    Intel(R) UHD Graphics 770|       3.0|         32|     512|     32|    53651849216|
-| 4|    [opencl:cpu:0]|         13th Gen Intel(R) Core(TM) i7-13700K|       3.0|         24|    8192|     64|    67064815616|
-| 5|    [opencl:acc:0]|               Intel(R) FPGA Emulation Device|       1.2|         24|67108864|     64|    67064815616|
 ```
 
-| Attribute              | Note                                                        |
-|------------------------|-------------------------------------------------------------|
-| compute capability 1.3 | Level-zero driver/runtime, recommended                      |
-| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases |
 
 4. Launch inference
 
 There are two device selection modes:
 
 - Single device: Use one device target specified by the user.
-- Multiple devices: Automatically select the devices with the same largest Max compute-units.
+- Multiple devices: Automatically choose the devices with the same backend.
+
+In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.
 
 | Device selection | Parameter                              |
 |------------------|----------------------------------------|
@@ -474,33 +469,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
 build\bin\ls-sycl-device.exe
 ```
 
-The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following:
+This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
 ```
-found 6 SYCL devices:
+found 2 SYCL devices:
 |  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
 |ID|       Device Type|                                         Name|capability|units      |group   |group  |Global mem size|
 |--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
 | 0|[level_zero:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       1.3|        512|    1024|     32|    16225243136|
 | 1|[level_zero:gpu:1]|                    Intel(R) UHD Graphics 770|       1.3|         32|     512|     32|    53651849216|
-| 2|    [opencl:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       3.0|        512|    1024|     32|    16225243136|
-| 3|    [opencl:gpu:1]|                    Intel(R) UHD Graphics 770|       3.0|         32|     512|     32|    53651849216|
-| 4|    [opencl:cpu:0]|         13th Gen Intel(R) Core(TM) i7-13700K|       3.0|         24|    8192|     64|    67064815616|
-| 5|    [opencl:acc:0]|               Intel(R) FPGA Emulation Device|       1.2|         24|67108864|     64|    67064815616|
 
 ```
 
-| Attribute              | Note                                                      |
-|------------------------|-----------------------------------------------------------|
-| compute capability 1.3 | Level-zero running time, recommended                      |
-| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases |
-
 
 4. Launch inference
 
 There are two device selection modes:
 
-- Single device: Use one device assigned by user.
-- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
+- Single device: Use one device assigned by user. Default device id is 0.
+- Multiple devices: Automatically choose the devices with the same backend.
+
+In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.
 
 | Device selection | Parameter                              |
 |------------------|----------------------------------------|
diff --git a/docs/build.md b/docs/build.md
index 916fcf22d..d9d12c467 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -16,7 +16,7 @@ In order to build llama.cpp you have four different options.
       make
       ```
 
-  - On Windows:
+  - On Windows (x86/x64 only, arm64 requires cmake):
 
     1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
     2. Extract `w64devkit` on your pc.
@@ -60,6 +60,17 @@ In order to build llama.cpp you have four different options.
       cmake -B build -G "Xcode"
       cmake --build build --config Debug
       ```
+    - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
+      - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
+        - Tab Workload: Desktop-development with C++
+        - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
+      - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
+      - For Windows on ARM (arm64, WoA) build with:
+        ```bash
+        cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
+        cmake --build build-arm64-windows-llvm-release
+        ```
+        Note: Building for arm64 could also be done just with MSVC (with the build-arm64-windows-MSVC preset, or the standard CMake build instructions). But MSVC does not support inline ARM assembly-code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.
 
 -   Using `gmake` (FreeBSD):
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 155743639..67b3d2774 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -21,7 +21,6 @@ else()
     add_subdirectory(embedding)
     add_subdirectory(eval-callback)
     add_subdirectory(export-lora)
-    add_subdirectory(finetune)
     add_subdirectory(gbnf-validator)
     add_subdirectory(gguf-hash)
     add_subdirectory(gguf-split)
@@ -53,5 +52,4 @@ else()
     add_subdirectory(simple)
     add_subdirectory(speculative)
     add_subdirectory(tokenize)
-    add_subdirectory(train-text-from-scratch)
 endif()
diff --git a/examples/deprecation-warning/README.md b/examples/deprecation-warning/README.md
index 1e20feb4a..59918ec2b 100644
--- a/examples/deprecation-warning/README.md
+++ b/examples/deprecation-warning/README.md
@@ -13,7 +13,6 @@ Please update all scripts and workflows to use the new binary names.
 | server | llama-server |
 | llama-bench | llama-bench |
 | embedding | llama-embedding |
-| finetune | llama-finetune |
 | quantize | llama-quantize |
 | tokenize | llama-tokenize |
 | export-lora | llama-export-lora |
@@ -45,7 +44,6 @@ Please update all scripts and workflows to use the new binary names.
 | save-load-state | llama-save-load-state |
 | simple | llama-simple |
 | speculative | llama-speculative |
-| train-text-from-scratch | llama-train-text-from-scratch |
 | vdot | llama-vdot |
 | tests/test-c.o | tests/test-c.o |
 
diff --git a/examples/export-lora/README.md b/examples/export-lora/README.md
index 1fb17feec..91c33c34a 100644
--- a/examples/export-lora/README.md
+++ b/examples/export-lora/README.md
@@ -6,12 +6,11 @@ Apply LORA adapters to base model and export the resulting model.
 usage: llama-export-lora [options]
 
 options:
-  -h, --help                         show this help message and exit
-  -m FNAME, --model-base FNAME       model path from which to load base model (default '')
-  -o FNAME, --model-out FNAME        path to save exported model (default '')
-  -l FNAME, --lora FNAME             apply LoRA adapter
-  -s FNAME S, --lora-scaled FNAME S  apply LoRA adapter with user defined scaling S
-  -t N, --threads N                  number of threads to use during computation (default: 4)
+  -m,    --model                  model path from which to load base model (default '')
+         --lora FNAME             path to LoRA adapter  (can be repeated to use multiple adapters)
+         --lora-scaled FNAME S    path to LoRA adapter with user defined scaling S  (can be repeated to use multiple adapters)
+  -t,    --threads N              number of threads to use during computation (default: 4)
+  -o,    --output FNAME           output file (default: 'ggml-lora-merged-f16.gguf')
 ```
 
 For example:
@@ -20,7 +19,15 @@ For example:
 ./bin/llama-export-lora \
     -m open-llama-3b-v2-q8_0.gguf \
     -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
-    -l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
+    --lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.gguf
 ```
 
-Multiple LORA adapters can be applied by passing multiple `-l FN` or `-s FN S` command line parameters.
+Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters:
+
+```bash
+./bin/llama-export-lora \
+    -m your_base_model.gguf \
+    -o your_merged_model.gguf \
+    --lora-scaled lora_task_A.gguf 0.5 \
+    --lora-scaled lora_task_B.gguf 0.5
+```
diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp
index 16f27aa77..150f7e8d5 100644
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -1,465 +1,420 @@
-
 #include "common.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
 
+#include <map>
 #include <vector>
 #include <string>
 #include <thread>
+#include <fstream>
 
-struct lora_info {
-    std::string filename;
+static bool g_verbose = false;
+
+static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){
+    int id = gguf_find_key(ctx_gguf, key.c_str());
+    return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
+}
+
+static float get_kv_f32(struct gguf_context * ctx_gguf, const std::string & key) {
+    int id = gguf_find_key(ctx_gguf, key.c_str());
+    return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf, id);
+}
+
+static void zeros(std::ofstream & file, size_t n) {
+    char zero = 0;
+    for (size_t i = 0; i < n; ++i) {
+        file.write(&zero, 1);
+    }
+}
+
+static std::string ggml_ne_string(const ggml_tensor * t) {
+    std::string str;
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        str += std::to_string(t->ne[i]);
+        if (i + 1 < GGML_MAX_DIMS) {
+            str += ", ";
+        }
+    }
+    return str;
+}
+
+static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) {
+    struct gguf_init_params params = {
+        /*.no_alloc = */ true,
+        /*.ctx      = */ ctx_ggml,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params);
+    if (!ctx_gguf) {
+        throw std::runtime_error("failed to load input GGUF from " + fname);
+    }
+    return ctx_gguf;
+}
+
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    std::string result;
+    for (size_t pos = 0; ; pos += search.length()) {
+        auto new_pos = s.find(search, pos);
+        if (new_pos == std::string::npos) {
+            result += s.substr(pos, s.size() - pos);
+            break;
+        }
+        result += s.substr(pos, new_pos - pos) + replace;
+        pos = new_pos;
+    }
+    s = std::move(result);
+}
+
+struct file_input {
+    struct ggml_context * ctx_meta = nullptr;
+    struct gguf_context * ctx_gguf = nullptr;
+    std::ifstream f_in;
+    std::map<std::string, ggml_tensor *> tensors;
+    float alpha;
     float scale;
+
+    file_input(std::string & fname, float scale): f_in(fname, std::ios::binary), scale(scale) {
+        if (!f_in.is_open()) {
+            throw std::runtime_error("failed to open input gguf from " + fname);
+        }
+
+        ctx_gguf = load_gguf(fname, &ctx_meta);
+        alpha = get_kv_f32(ctx_gguf, "adapter.lora.alpha");
+        printf("%s: loaded gguf from %s\n", __func__, fname.c_str());
+
+        for (ggml_tensor * cur = ggml_get_first_tensor(ctx_meta); cur; cur = ggml_get_next_tensor(ctx_meta, cur)) {
+            std::string name(cur->name);
+            tensors[name] = cur;
+            if (g_verbose) {
+                printf("%s: %s\n", __func__, cur->name);
+            }
+        }
+    }
+
+    ggml_tensor * get_tensor(std::string name) {
+        if (tensors.find(name) == tensors.end()) {
+            return nullptr;
+        }
+        return tensors[name];
+    }
+
+    void read_tensor_data(std::string name, std::vector<uint8_t> & buf) {
+        if (tensors.find(name) == tensors.end()) {
+            throw std::runtime_error("cannot find tensor with name: " + name);
+        }
+        auto len = ggml_nbytes(tensors[name]);
+        if (buf.size() < len) {
+            buf.resize(len);
+        }
+        auto i_tensor_in = gguf_find_tensor(ctx_gguf, name.c_str()); // idx of tensor in the input file
+        auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in);
+        f_in.seekg(offset);
+        f_in.read((char* )buf.data(), len);
+    }
+
+    ~file_input() {
+        gguf_free(ctx_gguf);
+        ggml_free(ctx_meta);
+    }
 };
 
-struct export_lora_params {
-    std::string fn_model_base;
-    std::string fn_model_out;
-    std::vector<struct lora_info> lora;
+struct lora_merge_ctx {
+    // input base model + adapters
+    file_input base_model;
+    std::vector<std::unique_ptr<file_input>> adapters;
+
+    // for computing merged tensor
     int n_threads;
-};
+    ggml_backend_t backend = nullptr;
+    ggml_gallocr_t allocr = nullptr;
+    std::vector<uint8_t> read_buf;
 
-struct lora_data {
-    struct lora_info     info;
-    std::vector<uint8_t> data;
-    struct ggml_context * ctx;
+    // output file
+    struct gguf_context * ctx_out;
+    struct ggml_context * ctx_out_ggml;
+    std::ofstream fout;
 
-    uint32_t lora_r;
-    uint32_t lora_alpha;
-};
+    lora_merge_ctx(
+            std::string & base_fname,
+            std::vector<std::tuple<std::string, float>> & lora_files,
+            std::string & outfile,
+            int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
+        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
 
-struct llama_file {
-    // use FILE * so we don't have to re-open the file to mmap
-    FILE * fp;
-    size_t size;
+        if (gguf_find_key(base_model.ctx_gguf, LLM_KV_SPLIT_COUNT) >= 0) {
+            throw std::runtime_error("split model is not yet supported");
+        }
 
-    llama_file(const char * fname, const char * mode) {
-        fp = std::fopen(fname, mode);
-        if (fp == NULL) {
-            size = 0;
+        for (auto lora_inp : lora_files) {
+            auto fname = std::get<0>(lora_inp);
+            auto scale = std::get<1>(lora_inp);
+            std::unique_ptr<file_input> adapter(new file_input(fname, scale));
+            check_metadata_lora(adapter.get());
+            adapters.push_back(std::move(adapter));
+        }
+
+        ctx_out = gguf_init_empty();
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead(),
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
+        };
+        ctx_out_ggml = ggml_init(params);
+        backend = ggml_backend_cpu_init();
+        allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+    }
+
+    void check_metadata_lora(file_input * adapter) {
+        auto general_type = get_kv_str(adapter->ctx_gguf, "general.type");
+        if (general_type != "adapter") {
+            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
+        }
+
+        auto adapter_type = get_kv_str(adapter->ctx_gguf, "adapter.type");
+        if (adapter_type != "lora") {
+            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
+        }
+
+        auto general_arch_base = get_kv_str(base_model.ctx_gguf, "general.architecture");
+        auto general_arch_lora = get_kv_str(adapter->ctx_gguf,   "general.architecture");
+        if (general_arch_base != general_arch_lora) {
+            throw std::runtime_error("model arch and LoRA arch mismatch");
+        }
+    }
+
+    ggml_type get_out_tensor_type(struct ggml_tensor * t) {
+        if (t->type == GGML_TYPE_F32) {
+            return GGML_TYPE_F32;
         } else {
-            seek(0, SEEK_END);
-            size = tell();
-            seek(0, SEEK_SET);
+            return GGML_TYPE_F16;
         }
     }
 
-    size_t tell() const {
-#ifdef _WIN32
-        __int64 ret = _ftelli64(fp);
-#else
-        long ret = std::ftell(fp);
-#endif
-        GGML_ASSERT(ret != -1); // this really shouldn't fail
-        return (size_t) ret;
-    }
+    void run_merge() {
+        // prepare metadata
+        gguf_set_kv(ctx_out, base_model.ctx_gguf);
+        // output is forced to f16 for now
+        gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16);
 
-    void seek(size_t offset, int whence) {
-#ifdef _WIN32
-        int ret = _fseeki64(fp, (__int64) offset, whence);
-#else
-        int ret = std::fseek(fp, (long) offset, whence);
-#endif
-        GGML_ASSERT(ret == 0); // same
-    }
-
-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
-            return;
+        // check if all lora adapters have the same tensors
+        // TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggerganov/llama.cpp/pull/8607#discussion_r1686027777
+        static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. Please merge the adapter one-by-one instead of merging all at once.";
+        if (adapters.size() > 1) {
+            for (size_t i = 1; i < adapters.size(); ++i) {
+                if (adapters[0]->tensors.size() != adapters[i]->tensors.size()) {
+                    throw std::runtime_error(err_no_subset_adapter);
+                }
+                for (auto & it : adapters[i]->tensors) {
+                    if (adapters[0]->get_tensor(it.first) == nullptr) {
+                        throw std::runtime_error(err_no_subset_adapter);
+                    }
+                }
+            }
         }
-        errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
-        if (ferror(fp)) {
-            die_fmt("read error: %s", strerror(errno));
+
+        // mapping base tensor to out tensor (same shape with base, but different type)
+        // if out_tensor == nullptr, we only copy it
+        std::vector<std::pair<struct ggml_tensor *, struct ggml_tensor *>> base_to_out_tensors;
+        for (auto & it : base_model.tensors) {
+            bool t_a = true;
+            bool t_b = true;
+            for (auto & adapter : adapters) {
+                t_a &= nullptr != adapter->get_tensor(it.first + ".lora_a");
+                t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b");
+            }
+            auto base_tensor = it.second;
+            if (!t_a && !t_b) {
+                // only copy
+                struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
+                ggml_set_name(cpy_tensor, base_tensor->name);
+                base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr));
+                gguf_add_tensor(ctx_out, cpy_tensor);
+            } else if (t_a && t_b) {
+                // need merging
+                struct ggml_tensor * out_tensor = ggml_new_tensor(
+                    ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne);
+                ggml_set_name(out_tensor, base_tensor->name);
+                base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor));
+                gguf_add_tensor(ctx_out, out_tensor);
+            } else {
+                throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
+            }
         }
-        if (ret != 1) {
-            die("unexpectedly reached end of file");
+
+        // placeholder for the meta data
+        {
+            size_t meta_size = gguf_get_meta_size(ctx_out);
+            zeros(fout, meta_size);
         }
-    }
 
-    std::uint32_t read_u32() {
-        std::uint32_t ret;
-        read_raw(&ret, sizeof(ret));
-        return ret;
-    }
-
-    std::string read_string(std::uint32_t len) {
-        std::vector<char> chars(len);
-        read_raw(chars.data(), len);
-        return std::string(chars.data(), len);
-    }
-
-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
-            return;
+        // process base model tensors
+        size_t n_merged = 0;
+        for (auto & it : base_to_out_tensors) {
+            if (it.second != nullptr) {
+                merge_tensor(it.first, it.second);
+                n_merged++;
+            } else {
+                copy_tensor(it.first);
+            }
         }
-        errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
-        if (ret != 1) {
-            die_fmt("write error: %s", strerror(errno));
+
+        // write output metadata
+        {
+            std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
+            gguf_get_meta_data(ctx_out, data.data());
+            fout.seekp(0);
+            fout.write((const char *)data.data(), data.size());
         }
+
+        printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
+        printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size());
     }
 
-    void write_u32(std::uint32_t val) {
-        write_raw(&val, sizeof(val));
+    void copy_tensor(struct ggml_tensor * base) {
+        printf("%s :  %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
+        size_t len = ggml_nbytes(base);
+        base_model.read_tensor_data(base->name, read_buf);
+        fout.write((char* )read_buf.data(), len);
+        zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
     }
 
-    bool eof() {
-        return tell() >= size;
-    }
+    void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) {
+        std::string name_base(base->name);
+        std::string name_lora_a = name_base + ".lora_a";
+        std::string name_lora_b = name_base + ".lora_b";
 
-    ~llama_file() {
-        if (fp) {
-            std::fclose(fp);
+        printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
+
+        // context for input tensor
+        std::vector<struct ggml_tensor *> inp_a(adapters.size());
+        std::vector<struct ggml_tensor *> inp_b(adapters.size());
+        struct ggml_init_params params {
+            /*.mem_size   =*/ ggml_tensor_overhead()*(2+adapters.size()*2),
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
+        };
+        struct ggml_context * ctx = ggml_init(params);
+
+        // alloc tensors
+        struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, base->ne);
+        for (size_t i = 0; i < adapters.size(); ++i) {
+            auto t_a = adapters[i]->get_tensor(name_lora_a);
+            auto t_b = adapters[i]->get_tensor(name_lora_b);
+            inp_a[i] = ggml_dup_tensor(ctx, t_a);
+            inp_b[i] = ggml_dup_tensor(ctx, t_b);
         }
+        ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
+
+        // load base tensor to backend buffer
+        base_model.read_tensor_data(name_base, read_buf);
+        if (base->type != GGML_TYPE_F32) {
+            // optionally dequantize it
+            printf("%s :   + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
+            auto nels = ggml_nelements(inp_base);
+            ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
+            std::vector<uint8_t> dequant_buf(nels * sizeof(float));
+            qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
+            ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
+        } else {
+            ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
+        }
+
+        // load lora tensors to backend buffer
+        for (size_t i = 0; i < adapters.size(); ++i) {
+            adapters[i]->read_tensor_data(name_lora_a, read_buf);
+            ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i]));
+            adapters[i]->read_tensor_data(name_lora_b, read_buf);
+            ggml_backend_tensor_set(inp_b[i], read_buf.data(), 0, ggml_nbytes(inp_b[i]));
+        }
+
+        // build graph
+        struct ggml_cgraph * gf;
+        {
+            static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
+            static std::vector<uint8_t> buf(buf_size);
+            struct ggml_init_params params0 = {
+                /*.mem_size   =*/ buf_size,
+                /*.mem_buffer =*/ buf.data(),
+                /*.no_alloc   =*/ true,
+            };
+            struct ggml_context * ctx0 = ggml_init(params0);
+            gf = ggml_new_graph(ctx0);
+            struct ggml_tensor * cur = inp_base;
+            for (size_t i = 0; i < adapters.size(); ++i) {
+                struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)));
+                struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
+                // scale
+                const float alpha = adapters[i]->alpha;
+                const float rank  = (float) inp_b[i]->ne[0];
+                const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
+                delta = ggml_scale(ctx0, delta, scale);
+                cur = ggml_add(ctx0, delta, cur);
+                printf("%s :   + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
+                printf("%s :     input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
+            }
+            cur = ggml_cast(ctx0, cur, out->type);
+            printf("%s :   + output type is %s\n", __func__, ggml_type_name(out->type));
+            ggml_build_forward_expand(gf, cur);
+            ggml_free(ctx0);
+        }
+
+        // compute
+        {
+            ggml_gallocr_alloc_graph(allocr, gf);
+            ggml_backend_cpu_set_n_threads(backend, n_threads);
+            ggml_backend_graph_compute(backend, gf);
+        }
+
+        // write data to output file
+        {
+            auto result = gf->nodes[gf->n_nodes - 1];
+            size_t len = ggml_nbytes(result);
+            if (read_buf.size() < len) {
+                read_buf.resize(len);
+            }
+            ggml_backend_tensor_get(result, read_buf.data(), 0, len);
+            fout.write((char* )read_buf.data(), len);
+            zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
+        }
+
+        ggml_free(ctx);
+        ggml_backend_buffer_free(buffer);
+    }
+
+    ~lora_merge_ctx() {
+        ggml_gallocr_free(allocr);
+        ggml_backend_free(backend);
+        gguf_free(ctx_out);
+        ggml_free(ctx_out_ggml);
     }
 };
 
-static struct export_lora_params get_default_export_lora_params() {
-    struct export_lora_params result;
-    result.fn_model_base = "";
-    result.fn_model_out  = "";
-    result.n_threads = GGML_DEFAULT_N_THREADS;
-    return result;
-}
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
 
-static void export_lora_print_usage(int /*argc*/, char ** argv, const struct export_lora_params * params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help                         show this help message and exit\n");
-    fprintf(stderr, "  -m FNAME, --model-base FNAME       model path from which to load base model (default '%s')\n", params->fn_model_base.c_str());
-    fprintf(stderr, "  -o FNAME, --model-out FNAME        path to save exported model (default '%s')\n", params->fn_model_out.c_str());
-    fprintf(stderr, "  -l FNAME, --lora FNAME             apply LoRA adapter\n");
-    fprintf(stderr, "  -s FNAME S, --lora-scaled FNAME S  apply LoRA adapter with user defined scaling S\n");
-    fprintf(stderr, "  -t N, --threads N                  number of threads to use during computation (default: %d)\n", params->n_threads);
-}
-
-static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_params * params) {
-    bool invalid_param = false;
-    std::string arg;
-    struct export_lora_params default_params = get_default_export_lora_params();
-    const std::string arg_prefix = "--";
-
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
-            std::replace(arg.begin(), arg.end(), '_', '-');
-        }
-
-        if (arg == "-m" || arg == "--model-base") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->fn_model_base = argv[i];
-        } else if (arg == "-o" || arg == "--model-out") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->fn_model_out = argv[i];
-        } else if (arg == "-l" || arg == "--lora") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            struct lora_info lora;
-            lora.filename = argv[i];
-            lora.scale = 1.0f;
-            params->lora.push_back(lora);
-        } else if (arg == "-s" || arg == "--lora-scaled") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            struct lora_info lora;
-            lora.filename = argv[i];
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            lora.scale = std::stof(argv[i]);
-            params->lora.push_back(lora);
-        } else if (arg == "-t" || arg == "--threads") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_threads = std::stoi(argv[i]);
-            if (params->n_threads <= 0) {
-                params->n_threads = std::thread::hardware_concurrency();
-            }
-        } else if (arg == "-h" || arg == "--help") {
-            export_lora_print_usage(argc, argv, &default_params);
-            exit(0);
-        } else {
-            fprintf(stderr, "error: unknown argument: '%s'\n", arg.c_str());
-            export_lora_print_usage(argc, argv, &default_params);
-            exit(1);
-        }
-    }
-
-    if (params->fn_model_base == default_params.fn_model_base) {
-        fprintf(stderr, "error: please specify a filename for model-base.\n");
-        export_lora_print_usage(argc, argv, &default_params);
-        exit(1);
-    }
-    if (params->fn_model_out == default_params.fn_model_out) {
-        fprintf(stderr, "error: please specify a filename for model-out.\n");
-        export_lora_print_usage(argc, argv, &default_params);
-        exit(1);
-    }
-    if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: '%s'\n", arg.c_str());
-        export_lora_print_usage(argc, argv, &default_params);
-        exit(1);
-    }
-    return true;
-}
-
-static void free_lora(struct lora_data * lora) {
-    if (lora->ctx != NULL) {
-        ggml_free(lora->ctx);
-    }
-    delete lora;
-}
-
-static struct lora_data * load_lora(struct lora_info * info) {
-    struct lora_data * result = new struct lora_data;
-    result->info = *info;
-    result->ctx = NULL;
-    result->lora_r     = 1;
-    result->lora_alpha = 1;
-
-    struct llama_file file(info->filename.c_str(), "rb");
-    if (file.fp == NULL) {
-        fprintf(stderr, "warning: Could not open lora adapter '%s'. Ignoring this adapter.\n",
-            info->filename.c_str());
-        free_lora(result);
-        return NULL;
-    }
-
-    struct ggml_init_params params_ggml;
-    params_ggml.mem_size   = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE;
-    params_ggml.mem_buffer = NULL;
-    params_ggml.no_alloc   = true;
-    result->ctx = ggml_init(params_ggml);
-
-    uint32_t magic   = file.read_u32();
-    if (magic != LLAMA_FILE_MAGIC_GGLA) {
-        die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str());
-    }
-    uint32_t version = file.read_u32();
-    if (version != 1) {
-        die_fmt("unexpected lora file version '%u' in '%s'", (unsigned) version, info->filename.c_str());
-    }
-    result->lora_r     = file.read_u32();
-    result->lora_alpha = file.read_u32();
-    // read tensor infos from file
-    std::vector<char> name_buf;
-    std::vector<struct ggml_tensor *> tensors;
-    std::vector<size_t> tensors_offset;
-    size_t total_nbytes_pad = 0;
-    while(!file.eof()) {
-        int64_t ne[4]   = {1,1,1,1};
-        uint32_t n_dims  = file.read_u32();
-        uint32_t namelen = file.read_u32();
-        uint32_t type    = file.read_u32();
-        for (uint32_t k = 0; k < n_dims; ++k) {
-            ne[k] = (int64_t)file.read_u32();
-        }
-        name_buf.clear();
-        name_buf.resize(namelen + 1, '\0');
-        file.read_raw(name_buf.data(), namelen);
-        file.seek((0-file.tell()) & 31, SEEK_CUR);
-        size_t offset = file.tell();
-        struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne);
-        ggml_set_name(tensor, name_buf.data());
-        size_t nbytes     = ggml_nbytes(tensor);
-        size_t nbytes_pad = ggml_nbytes_pad(tensor);
-        total_nbytes_pad += nbytes_pad;
-        tensors.push_back(tensor);
-        tensors_offset.push_back(offset);
-        file.seek(nbytes, SEEK_CUR);
-    }
-    // read tensor data
-    result->data.resize(total_nbytes_pad);
-    size_t data_offset = 0;
-    for (size_t i = 0; i < tensors.size(); ++i) {
-        struct ggml_tensor * tensor = tensors[i];
-        size_t offset     = tensors_offset[i];
-        size_t nbytes     = ggml_nbytes(tensor);
-        size_t nbytes_pad = ggml_nbytes_pad(tensor);
-        file.seek(offset, SEEK_SET);
-        tensor->data = result->data.data() + data_offset;
-        file.read_raw(tensor->data, nbytes);
-        data_offset += nbytes_pad;
-    }
-    return result;
-}
-
-
-static struct ggml_cgraph * build_graph_lora(
-    struct ggml_context * ctx,
-    struct ggml_tensor * tensor,
-    struct ggml_tensor * lora_a,
-    struct ggml_tensor * lora_b,
-    float scaling
-) {
-    struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
-    if (scaling != 1.0f) {
-        ab = ggml_scale(ctx, ab, scaling);
-    }
-    struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);
-
-    struct ggml_cgraph * gf = ggml_new_graph(ctx);
-    ggml_build_forward_expand (gf, res);
-    return gf;
-}
-
-static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int n_threads) {
-    if (lora->ctx == NULL) {
-        return false;
-    }
-    std::string name = ggml_get_name(tensor);
-    std::string name_a = name + std::string(".loraA");
-    std::string name_b = name + std::string(".loraB");
-    struct ggml_tensor * lora_a = ggml_get_tensor(lora->ctx, name_a.c_str());
-    struct ggml_tensor * lora_b = ggml_get_tensor(lora->ctx, name_b.c_str());
-    if (lora_a == NULL || lora_b == NULL) {
-        return false;
-    }
-
-    float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r;
-
-    struct ggml_init_params params;
-    params.mem_size   = GGML_OBJECT_SIZE + ggml_graph_overhead() + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
-    params.mem_buffer = NULL;
-    params.no_alloc   = true;
-    struct ggml_context * ctx = NULL;
-    struct ggml_gallocr * alloc = NULL;
-    struct ggml_cgraph  * gf = NULL;
-
-    ctx   = ggml_init(params);
-    alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
-    gf    = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
-
-    ggml_gallocr_alloc_graph(alloc, gf);
-
-    struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);
-    static std::vector<uint8_t> data_work;
-    data_work.resize(cplan.work_size);
-    cplan.work_data = data_work.data();
-
-    ggml_graph_compute(gf, &cplan);
-
-    ggml_gallocr_free(alloc);
-    ggml_free(ctx);
-    return true;
-}
-
-static void export_lora(struct export_lora_params * params) {
-    // load all loras
-    std::vector<struct lora_data *> loras;
-    for (size_t i = 0; i < params->lora.size(); ++i) {
-        struct lora_data * lora = load_lora(&params->lora[i]);
-        if (lora != NULL) {
-            loras.push_back(lora);
-        }
-    }
-    if (loras.size() == 0) {
-        fprintf(stderr, "warning: no lora adapters will be applied.\n");
-    }
-
-    // open input file
-    struct llama_file fin(params->fn_model_base.c_str(), "rb");
-    if (!fin.fp) {
-        die_fmt("Could not open file '%s'\n", params->fn_model_base.c_str());
-    }
-
-    // open base model gguf, read tensors without their data
-    struct ggml_context * ctx_in;
-    struct gguf_init_params params_gguf;
-    params_gguf.no_alloc = true;
-    params_gguf.ctx      = &ctx_in;
-    struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf);
-
-    // create new gguf
-    struct gguf_context * gguf_out = gguf_init_empty();
-
-    // copy meta data from base model: kv and tensors
-    gguf_set_kv(gguf_out, gguf_in);
-    int n_tensors = gguf_get_n_tensors(gguf_in);
-    for (int i=0; i < n_tensors; ++i) {
-        const char * name = gguf_get_tensor_name(gguf_in, i);
-        struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
-        gguf_add_tensor(gguf_out, tensor);
-    }
-
-    // create output file
-    struct llama_file fout(params->fn_model_out.c_str(), "wb");
-    if (!fout.fp) {
-        die_fmt("Could not create file '%s'\n", params->fn_model_out.c_str());
-    }
-
-    // write gguf meta data
-    std::vector<uint8_t> meta;
-    meta.resize(gguf_get_meta_size(gguf_out));
-    gguf_get_meta_data(gguf_out, meta.data());
-    fout.write_raw(meta.data(), meta.size());
-
-    std::vector<uint8_t> data;
-    std::vector<uint8_t> padding;
-    for (int i=0; i < n_tensors; ++i) {
-        const char * name = gguf_get_tensor_name(gguf_in, i);
-        struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
-
-        // read tensor data
-        data.resize(ggml_nbytes(tensor));
-        tensor->data = data.data();
-        size_t offset = gguf_get_tensor_offset(gguf_in, i);
-        fin.seek(offset + meta.size(), SEEK_SET);
-        fin.read_raw(data.data(), data.size());
-
-        // apply all loras
-        for (size_t k = 0; k < loras.size(); ++k) {
-            apply_lora(tensor, loras[k], params->n_threads);
-        }
-
-        // write tensor data + padding
-        padding.clear();
-        padding.resize(GGML_PAD(data.size(), gguf_get_alignment(gguf_out)) - data.size(), 0);
-
-        GGML_ASSERT(fout.tell() == offset + meta.size());
-        // fout.seek(offset + meta.size(), SEEK_SET);
-        fout.write_raw(data.data(), data.size());
-        fout.write_raw(padding.data(), padding.size());
-
-        if (i % 2 == 0) {
-            printf(".");
-        }
-    }
+    printf("\nexample usage:\n");
+    printf("\n  %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
+    printf("\nNOTE: output model is F16\n");
     printf("\n");
-
-    // close gguf
-    gguf_free(gguf_out);
-    gguf_free(gguf_in);
-
-    // free loras
-    for (size_t i = 0; i < loras.size(); ++i) {
-        free_lora(loras[i]);
-    }
 }
 
 int main(int argc, char ** argv) {
-    struct export_lora_params params = get_default_export_lora_params();
+    gpt_params params;
 
-    if (!export_lora_params_parse(argc, argv, &params)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
         return 1;
     }
 
-    export_lora(&params);
+    g_verbose = (params.verbosity == 1);
+    try {
+        lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.n_threads);
+        ctx.run_merge();
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s\n", err.what());
+        exit(EXIT_FAILURE);
+    }
+
+    printf("done, output file is %s\n", params.lora_outfile.c_str());
 
     return 0;
 }
diff --git a/examples/finetune/CMakeLists.txt b/examples/finetune/CMakeLists.txt
deleted file mode 100644
index 64afe6ddc..000000000
--- a/examples/finetune/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-set(TARGET llama-finetune)
-add_executable(${TARGET} finetune.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/finetune/README.md b/examples/finetune/README.md
deleted file mode 100644
index 1c27df053..000000000
--- a/examples/finetune/README.md
+++ /dev/null
@@ -1,90 +0,0 @@
-# finetune
-
-Basic usage instructions:
-
-```bash
-# get training data
-wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
-
-# finetune LORA adapter
-./bin/llama-finetune \
-        --model-base open-llama-3b-v2-q8_0.gguf \
-        --checkpoint-in  chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
-        --checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
-        --lora-out lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \
-        --train-data "shakespeare.txt" \
-        --save-every 10 \
-        --threads 6 --adam-iter 30 --batch 4 --ctx 64 \
-        --use-checkpointing
-
-# predict
-./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
-```
-
-**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
-The pattern 'ITERATION' in the output filenames will be replaced with the iteration number and with 'LATEST' for the latest output.
-So in above example after 10 iterations these files will be written:
-- chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf
-- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
-- lora-open-llama-3b-v2-q8_0-shakespeare-10.bin
-- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
-
-After 10 more iterations:
-- chk-lora-open-llama-3b-v2-q8_0-shakespeare-20.gguf
-- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
-- lora-open-llama-3b-v2-q8_0-shakespeare-20.bin
-- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
-
-Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.
-
-llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`.
-These LORA adapters can then be used by `llama-cli` together with the base model, like in the 'predict' example command above.
-
-In `llama-cli` you can also load multiple LORA adapters, which will then be mixed together.
-
-For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
-
-```bash
-./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
-  --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
-  --lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
-```
-
-You can change how strong each LORA adapter is applied to the base model by using `--lora-scaled FN SCALE` instead of `--lora FN`.
-
-For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
-
-```bash
-./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
-  --lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
-  --lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
-  --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
-```
-
-The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values.
-
-Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
-If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.
-
-The default LORA rank can be specified with `--lora-r N`.
-The LORA rank can be configured for each model tensor type separately with these command line options:
-
-```bash
-  --lora-r N                 LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default 4)
-  --rank-att-norm N          LORA rank for attention norm tensor (default 1)
-  --rank-ffn-norm N          LORA rank for feed-forward norm tensor (default 1)
-  --rank-out-norm N          LORA rank for output norm tensor (default 1)
-  --rank-tok-embd N          LORA rank for token embeddings tensor (default 4)
-  --rank-out N               LORA rank for output tensor (default 4)
-  --rank-wq N                LORA rank for wq tensor (default 4)
-  --rank-wk N                LORA rank for wk tensor (default 4)
-  --rank-wv N                LORA rank for wv tensor (default 4)
-  --rank-wo N                LORA rank for wo tensor (default 4)
-  --rank-ffn_gate N          LORA rank for ffn_gate tensor (default 4)
-  --rank-ffn_down N          LORA rank for ffn_down tensor (default 4)
-  --rank-ffn_up N            LORA rank for ffn_up tensor (default 4)
-```
-
-The LORA rank of 'norm' tensors should always be 1.
-
-To see all available options use `llama-finetune --help`.
diff --git a/examples/finetune/convert_finetune_checkpoint_to_gguf.py b/examples/finetune/convert_finetune_checkpoint_to_gguf.py
deleted file mode 100644
index 1b79d6995..000000000
--- a/examples/finetune/convert_finetune_checkpoint_to_gguf.py
+++ /dev/null
@@ -1,487 +0,0 @@
-#!/usr/bin/env python3
-# finetune checkpoint --> gguf conversion
-
-import argparse
-import gguf
-import struct
-import numpy as np
-from pathlib import Path
-
-# gguf constants
-LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
-LLM_KV_OPTIMIZER_TYPE_ADAM  = "adam"
-LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
-LLM_KV_OPTIMIZER_FILE_VERSION               = "optimizer.file_version"
-LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT     = "optimizer.convergence_past_count"
-LLM_KV_OPTIMIZER_PARAMETER_COUNT            = "optimizer.parameter_count"
-LLM_KV_OPTIMIZER_ITERATION_COUNT            = "optimizer.iteration_count"
-LLM_KV_OPTIMIZER_JUST_INITIALIZED           = "optimizer.just_initialized"
-LLM_KV_OPTIMIZER_ADAM_BEST_LOSS             = "optimizer.adam.best_loss"
-LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS         = "optimizer.adam.previous_loss"
-LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT  = "optimizer.adam.no_improvement_count"
-LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
-LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS            = "optimizer.lbfgs.best_loss"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP     = "optimizer.lbfgs.line_search_step"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J        = "optimizer.lbfgs.line_search_j"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K        = "optimizer.lbfgs.line_search_k"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END      = "optimizer.lbfgs.line_search_end"
-LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
-
-LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS    = "optimizer.adam.first_moments"
-LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS   = "optimizer.adam.second_moments"
-LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
-
-LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS  = "optimizer.lbfgs.current_parameters"
-LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
-LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS   = "optimizer.lbfgs.current_gradients"
-LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS  = "optimizer.lbfgs.previous_gradients"
-LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION    = "optimizer.lbfgs.search_direction"
-LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES    = "optimizer.lbfgs.past_loss_values"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA        = "optimizer.lbfgs.memory_alpha"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS           = "optimizer.lbfgs.memory_ys"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S            = "optimizer.lbfgs.memory_s"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y            = "optimizer.lbfgs.memory_y"
-
-LLM_KV_TRAINING_TYPE_TRAIN_MODEL   = "train_model"
-LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
-LLM_KV_TRAINING_TYPE               = "training.type"
-LLM_KV_TRAINING_FILE_VERSION       = "training.file_version"
-LLM_KV_TRAINING_ITERATION_COUNT    = "training.iteration_count"
-LLM_KV_TRAINING_SAMPLE_COUNT       = "training.sample_count"
-LLM_KV_TRAINING_TOKEN_COUNT        = "training.token_count"
-
-LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD  = "training.lora.rank.token_embd"
-LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"
-LLM_KV_TRAINING_LORA_RANK_OUTPUT      = "training.lora.rank.output"
-LLM_KV_TRAINING_LORA_RANK_ATTN_NORM   = "training.lora.rank.attn_norm"
-LLM_KV_TRAINING_LORA_RANK_ATTN_Q      = "training.lora.rank.attn_q"
-LLM_KV_TRAINING_LORA_RANK_ATTN_K      = "training.lora.rank.attn_k"
-LLM_KV_TRAINING_LORA_RANK_ATTN_V      = "training.lora.rank.attn_v"
-LLM_KV_TRAINING_LORA_RANK_ATTN_OUT    = "training.lora.rank.attn_output"
-LLM_KV_TRAINING_LORA_RANK_FFN_NORM    = "training.lora.rank.ffn_norm"
-LLM_KV_TRAINING_LORA_RANK_FFN_GATE    = "training.lora.rank.ffn_gate"
-LLM_KV_TRAINING_LORA_RANK_FFN_DOWN    = "training.lora.rank.ffn_down"
-LLM_KV_TRAINING_LORA_RANK_FFN_UP      = "training.lora.rank.ffn_up"
-
-class Tensor:
-    def __init__(self, dtype='f', ne=None):
-        if ne is None:
-            ne = []
-        self.dtype = dtype
-        self.ne = ne
-        self.nbytes = 0
-        if self.dtype == 'f':
-            if len(self.ne) == 0:
-                self.nbytes = 0
-            else:
-                self.nbytes = int(np.prod(self.ne)) * 4
-        else:
-            raise ValueError(f"Unhandled data type '{self.dtype}'")
-
-    def load(self, data, offset):
-        nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-
-        assert(nd == len(self.ne))
-        ne = []
-        for d in range(nd):
-            n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-            ne.append(n)
-
-        if tuple(ne) != tuple(self.ne):
-            raise ValueError(f"Tensor.load: Expected number of elements {str(self.ne)} does not match what is read from file {str(ne)}")
-
-        if self.dtype == 'f':
-            assert(dtype == 0)
-        else:
-            raise ValueError(f"Unhandled data type '{self.dtype}'")
-
-        self.name = bytes(data[offset:offset+namelen]); offset += namelen
-        # 32-byte alignment
-        offset += (0 - offset) & 31
-        self.data = data[offset:offset+self.nbytes]
-        offset += self.nbytes
-        return offset
-
-    def max_storage_size(self):
-        result = 0
-        result += 4 # nd
-        result += 4 # namelen
-        result += 4 # dtype
-        result += len(self.ne)*8 # ne
-        result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
-        result += 31 # 32-byte alignment
-        result += self.nbytes
-        return result
-
-    def save_gguf(self, gguf_writer, name):
-        gguf_writer.add_tensor(
-            name=name,
-            tensor=self.data,
-            raw_shape=np.array(list(reversed(self.ne))),
-            raw_dtype=gguf.GGMLQuantizationType.F32)
-
-class OptimizationContext:
-    def __init__(self):
-        pass
-
-    def load(self, data, offset):
-        self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
-        offset += 4
-
-        if self.version != 1:
-            raise ValueError('Invalid version of optimization context in checkpoint file')
-
-        self.past    = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.nx      = struct.unpack('N',  bytes(data[offset:offset + 8]))[0];  offset += 8
-        self.iter    = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]);  offset += 4
-
-        self.adam_m  = Tensor('f', [self.nx])
-        self.adam_v  = Tensor('f', [self.nx])
-        self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
-
-        self.lbfgs_x    = Tensor('f', [self.nx])
-        self.lbfgs_xp   = Tensor('f', [self.nx])
-        self.lbfgs_g    = Tensor('f', [self.nx])
-        self.lbfgs_gp   = Tensor('f', [self.nx])
-        self.lbfgs_d    = Tensor('f', [self.nx])
-        self.lbfgs_pf   = Tensor('f', [self.past] if self.past > 0 else [])
-        self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
-        self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
-        self.lbfgs_lms  = Tensor('f', [self.nx, self.lbfgs_m])
-        self.lbfgs_lmy  = Tensor('f', [self.nx, self.lbfgs_m])
-
-        # forgot to save type in version 1:
-        # guess self.type from number of remaining bytes
-        size_type_0 = 12 + sum([t.max_storage_size() for t in
-                                [self.adam_m, self.adam_v]
-                                +([self.adam_pf] if (self.past > 0) else [])])
-        size_type_1 = 24 + sum([t.max_storage_size() for t in
-                                [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
-                                 self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
-                                 self.lbfgs_lmal, self.lbfgs_lmys,
-                                 self.lbfgs_lms, self.lbfgs_lmy]
-                                 +([self.lbfgs_pf] if (self.past > 0) else [])])
-        # due to alignment padding the size might not by exact
-        # but the difference in size for both types is significant,
-        # so we can just use whichever is closest
-        remaining = len(data) - offset
-        if abs(remaining - size_type_0) < abs(remaining - size_type_1):
-            self.type = 0
-        else:
-            self.type = 1
-
-        if self.type == 0:
-            offset = self.adam_m.load(data, offset)
-            offset = self.adam_v.load(data, offset)
-            offset = self.adam_pf.load(data,offset)
-
-            self.adam_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.adam_fx_prev          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-
-        elif self.type == 1:
-            offset = self.lbfgs_x.load(data, offset)
-            offset = self.lbfgs_xp.load(data, offset)
-            offset = self.lbfgs_g.load(data, offset)
-            offset = self.lbfgs_gp.load(data, offset)
-            offset = self.lbfgs_d.load(data, offset)
-            offset = self.lbfgs_pf.load(data, offset)
-            offset = self.lbfgs_lmal.load(data, offset)
-            offset = self.lbfgs_lmys.load(data, offset)
-            offset = self.lbfgs_lms.load(data, offset)
-            offset = self.lbfgs_lmy.load(data, offset)
-
-            self.lbfgs_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.lbfgs_step             = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.lbfgs_j                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.lbfgs_k                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.lbfgs_end              = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-
-        else:
-            raise ValueError(f"Invalid optimizer type '{self.type}'")
-
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
-        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
-        gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
-        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
-        gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
-
-        if self.type == 0:
-            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
-            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
-
-            self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
-            self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
-            if self.past > 0:
-                self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
-
-        elif self.type == 1:
-            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
-            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
-            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
-            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
-            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
-            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
-
-            self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
-            self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
-            self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
-            self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
-            self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
-            if self.past > 0:
-                self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
-            self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
-            self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
-            self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
-            self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
-        else:
-            raise ValueError('Unknown optimizer type')
-
-class LoraParams:
-    def __init__(self):
-        pass
-
-    def load(self, data, offset):
-        self.n_rank_attention_norm  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_wq              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_wk              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_wv              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_wo              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_ffn_norm        = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_w1              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_w2              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_w3              = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_tok_embeddings  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_norm            = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rank_output          = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD,  self.n_rank_tok_embeddings)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, self.n_rank_norm)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT,      self.n_rank_output)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_NORM,   self.n_rank_attention_norm)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_Q,      self.n_rank_wq)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_K,      self.n_rank_wk)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_V,      self.n_rank_wv)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_OUT,    self.n_rank_wo)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_NORM,    self.n_rank_ffn_norm)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_GATE,    self.n_rank_w1)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_DOWN,    self.n_rank_w2)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_UP,      self.n_rank_w3)
-
-class ModelParams:
-    def __init__(self, n_ff = None):
-        self.n_ff = n_ff
-
-    def load(self, data, offset):
-        self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_embd  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_mult  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_head  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rot   = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        return offset
-
-    def get_n_ff(self):
-        if self.n_ff is None:
-            # struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
-            return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
-        else:
-            return self.n_ff
-
-    def save_gguf(self, gguf_writer):
-        # self.n_vocab not saved
-        gguf_writer.add_embedding_length(self.n_embd)
-        gguf_writer.add_head_count(self.n_head)
-        gguf_writer.add_block_count(self.n_layer)
-        gguf_writer.add_rope_dimension_count(self.n_rot)
-        gguf_writer.add_feed_forward_length(self.get_n_ff())
-
-def tensor_name(key, bid=None, suffix=".weight"):
-    return gguf.TENSOR_NAMES[key].format(bid=bid) + suffix
-
-class Layer:
-    def __init__(self, params, lora_params, bid):
-        self.bid = bid
-        self.att_norm_a = Tensor('f', [lora_params.n_rank_attention_norm, params.n_embd])
-        self.att_norm_b = Tensor('f', [lora_params.n_rank_attention_norm, 1])
-        self.wq_a       = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
-        self.wq_b       = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
-        self.wk_a       = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
-        self.wk_b       = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
-        self.wv_a       = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
-        self.wv_b       = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
-        self.wo_a       = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
-        self.wo_b       = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
-        self.ffn_norm_a = Tensor('f', [lora_params.n_rank_ffn_norm, params.n_embd])
-        self.ffn_norm_b = Tensor('f', [lora_params.n_rank_ffn_norm, 1])
-        self.w1_a       = Tensor('f', [lora_params.n_rank_w1, params.n_embd])
-        self.w1_b       = Tensor('f', [lora_params.n_rank_w1, params.get_n_ff()])
-        self.w2_a       = Tensor('f', [lora_params.n_rank_w2, params.get_n_ff()])
-        self.w2_b       = Tensor('f', [lora_params.n_rank_w2, params.n_embd])
-        self.w3_a       = Tensor('f', [lora_params.n_rank_w3, params.n_embd])
-        self.w3_b       = Tensor('f', [lora_params.n_rank_w3, params.get_n_ff()])
-
-    def load(self, data, offset):
-        offset = self.att_norm_a.load(data, offset)
-        offset = self.att_norm_b.load(data, offset)
-        offset = self.wq_a.load(data, offset)
-        offset = self.wq_b.load(data, offset)
-        offset = self.wk_a.load(data, offset)
-        offset = self.wk_b.load(data, offset)
-        offset = self.wv_a.load(data, offset)
-        offset = self.wv_b.load(data, offset)
-        offset = self.wo_a.load(data, offset)
-        offset = self.wo_b.load(data, offset)
-        offset = self.ffn_norm_a.load(data, offset)
-        offset = self.ffn_norm_b.load(data, offset)
-        offset = self.w1_a.load(data, offset)
-        offset = self.w1_b.load(data, offset)
-        offset = self.w2_a.load(data, offset)
-        offset = self.w2_b.load(data, offset)
-        offset = self.w3_a.load(data, offset)
-        offset = self.w3_b.load(data, offset)
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        self.att_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_a"))
-        self.att_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_b"))
-        self.wq_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q,    self.bid, ".weight.lora_a"))
-        self.wq_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q,    self.bid, ".weight.lora_b"))
-        self.wk_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K,    self.bid, ".weight.lora_a"))
-        self.wk_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K,    self.bid, ".weight.lora_b"))
-        self.wv_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V,    self.bid, ".weight.lora_a"))
-        self.wv_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V,    self.bid, ".weight.lora_b"))
-        self.wo_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT,  self.bid, ".weight.lora_a"))
-        self.wo_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT,  self.bid, ".weight.lora_b"))
-        self.ffn_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM,  self.bid, ".weight.lora_a"))
-        self.ffn_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM,  self.bid, ".weight.lora_b"))
-        self.w1_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE,  self.bid, ".weight.lora_a"))
-        self.w1_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE,  self.bid, ".weight.lora_b"))
-        self.w2_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN,  self.bid, ".weight.lora_a"))
-        self.w2_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN,  self.bid, ".weight.lora_b"))
-        self.w3_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP,    self.bid, ".weight.lora_a"))
-        self.w3_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP,    self.bid, ".weight.lora_b"))
-
-class LoraModel:
-    def __init__(self, n_ff = None):
-        self.params = ModelParams(n_ff = n_ff)
-        self.lora_params = LoraParams()
-        self.layers = []
-
-    def load(self, data, offset):
-        offset = self.params.load(data, offset)
-        offset = self.lora_params.load(data, offset)
-
-        self.tok_embd_a = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_embd])
-        self.tok_embd_b = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_vocab])
-        self.norm_a     = Tensor('f', [self.lora_params.n_rank_norm, self.params.n_embd])
-        self.norm_b     = Tensor('f', [self.lora_params.n_rank_norm, 1])
-        self.output_a   = Tensor('f', [self.lora_params.n_rank_output, self.params.n_embd])
-        self.output_b   = Tensor('f', [self.lora_params.n_rank_output, self.params.n_vocab])
-
-        offset = self.tok_embd_a.load(data, offset)
-        offset = self.tok_embd_b.load(data, offset)
-        offset = self.norm_a.load(data, offset)
-        offset = self.norm_b.load(data, offset)
-        offset = self.output_a.load(data, offset)
-        offset = self.output_b.load(data, offset)
-
-        self.layers.clear()
-        for bid in range(self.params.n_layer):
-            layer = Layer(self.params, self.lora_params, bid)
-            offset = layer.load(data, offset)
-            self.layers.append(layer)
-
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        self.params.save_gguf(gguf_writer)
-        self.lora_params.save_gguf(gguf_writer)
-
-        self.tok_embd_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD,  suffix=".weight.lora_a"))
-        self.tok_embd_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD,  suffix=".weight.lora_b"))
-        self.norm_a.save_gguf    (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_a"))
-        self.norm_b.save_gguf    (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_b"))
-        self.output_a.save_gguf  (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT,      suffix=".weight.lora_a"))
-        self.output_b.save_gguf  (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT,      suffix=".weight.lora_b"))
-
-        for layer in self.layers:
-            layer.save_gguf(gguf_writer)
-
-class LoraCheckpoint:
-    def __init__(self, n_ff = None):
-        self.model = LoraModel(n_ff = n_ff)
-        self.opt_ctx = OptimizationContext()
-
-    def load(self, data, offset):
-        magic   = bytes(reversed(data[offset:offset + 4])); offset += 4
-        if magic != b'ggcl':
-            raise ValueError(f"File header magic indicates, that this is no finetune-lora checkpoint file. Expected 'ggcl', Got '{str(magic)}'")
-
-        self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        if self.version != 0:
-            raise ValueError('Invalid version of checkpoint file')
-
-        self.train_its     = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        self.train_tokens  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-
-        offset = self.model.load(data, offset)
-        offset = self.opt_ctx.load(data, offset)
-
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
-        gguf_writer.add_layer_norm_rms_eps(1e-5)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION,    0)
-        gguf_writer.add_string(LLM_KV_TRAINING_TYPE,            LLM_KV_TRAINING_TYPE_FINETUNE_LORA)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT,    self.train_samples)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT,     self.train_tokens)
-        self.model.save_gguf(gguf_writer)
-        self.opt_ctx.save_gguf(gguf_writer)
-
-def handle_args():
-    parser = argparse.ArgumentParser(description = 'Convert finetune checkpoints to GGUF')
-    parser.add_argument('--input',  '-i', type = Path, help = 'Input finetune checkpoint filename', required=True)
-    parser.add_argument('--output', '-o', type = Path, help = 'Output GGUF filename', required=True)
-    parser.add_argument('--ff', type = int, help = "Feedforward size, if not provided compute from n_mult. Provide this if you get 'ValueError: Tensor.load: Expected number of elements does not match what is read from file'", required=False)
-    return parser.parse_args()
-
-def main():
-    cfg = handle_args()
-    print(cfg)
-    data = np.memmap(cfg.input, mode = 'r')
-    chk = LoraCheckpoint(n_ff = cfg.ff)
-    offset = 0
-    offset = chk.load(data, offset)
-    # we should have read all available data
-    assert(offset == len(data))
-
-    gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
-    chk.save_gguf(gguf_writer)
-    print("    gguf: write header")
-    gguf_writer.write_header_to_file()
-    print("    gguf: write metadata")
-    gguf_writer.write_kv_data_to_file()
-    print("    gguf: write tensors")
-    gguf_writer.write_tensors_to_file()
-    gguf_writer.close()
-
-if __name__ == '__main__':
-    main()
diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
deleted file mode 100644
index 71a4333ee..000000000
--- a/examples/finetune/finetune.cpp
+++ /dev/null
@@ -1,1862 +0,0 @@
-#include "ggml.h"
-#include "ggml-alloc.h"
-#include "ggml-backend.h"
-#include "llama.h"
-#include "common.h"
-#include "train.h"
-#include <vector>
-#include <cstring>
-#include <ctime>
-#include <algorithm>
-#include <string>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-struct my_llama_hparams {
-    uint32_t n_vocab    = 32000;
-    uint32_t n_ctx      = 512;
-    uint32_t n_embd     = 4096;
-    uint32_t n_ff       = 11008;
-    uint32_t n_head     = 32;
-    uint32_t n_head_kv  = 32;
-    uint32_t n_layer    = 32;
-
-    // float f_norm_eps     = 1e-5f; // falcon
-    float f_norm_rms_eps = 1e-5f; // llama
-
-    float rope_freq_base  = 10000.0f;
-    float rope_freq_scale = 1.0f;
-
-    uint32_t n_gqa() const {
-        return n_head/n_head_kv;
-    }
-
-    uint32_t n_embd_head() const {
-        return n_embd/n_head;
-    }
-
-    uint32_t n_embd_gqa() const {
-        return n_embd/n_gqa();
-    }
-
-    bool operator!=(const my_llama_hparams& other) const {
-        return memcmp(this, &other, sizeof(other));
-    }
-};
-
-struct my_llama_layer {
-    // normalization
-    struct ggml_tensor * attention_norm;
-
-    // attention
-    struct ggml_tensor * wq;
-    struct ggml_tensor * wk;
-    struct ggml_tensor * wv;
-    struct ggml_tensor * wo;
-
-    // normalization
-    struct ggml_tensor * ffn_norm;
-
-    // ff
-    struct ggml_tensor * ffn_gate; // w1
-    struct ggml_tensor * ffn_down; // w2
-    struct ggml_tensor * ffn_up;   // w3
-};
-
-struct my_llama_model {
-    struct my_llama_hparams hparams;
-
-    struct ggml_tensor * tok_embeddings;
-
-    struct ggml_tensor * norm;
-    struct ggml_tensor * output;
-
-    std::vector<my_llama_layer> layers;
-};
-
-struct my_llama_lora_hparams {
-    uint32_t lora_r = 1;
-    uint32_t lora_alpha = 1;
-    uint32_t n_rank_attention_norm = 1;
-    uint32_t n_rank_wq = 4;
-    uint32_t n_rank_wk = 4;
-    uint32_t n_rank_wv = 4;
-    uint32_t n_rank_wo = 4;
-    uint32_t n_rank_ffn_norm = 1;
-    uint32_t n_rank_ffn_gate = 4;
-    uint32_t n_rank_ffn_down = 4;
-    uint32_t n_rank_ffn_up = 4;
-    uint32_t n_rank_tok_embeddings = 4;
-    uint32_t n_rank_norm = 1;
-    uint32_t n_rank_output = 4;
-
-    bool operator!=(const my_llama_lora_hparams& other) const {
-        return memcmp(this, &other, sizeof(other));
-    }
-};
-
-struct my_llama_lora_layer {
-    // normalization
-    struct ggml_tensor * attention_norm_a;
-    struct ggml_tensor * attention_norm_b;
-
-    // attention
-    struct ggml_tensor * wq_a;
-    struct ggml_tensor * wq_b;
-    struct ggml_tensor * wk_a;
-    struct ggml_tensor * wk_b;
-    struct ggml_tensor * wv_a;
-    struct ggml_tensor * wv_b;
-    struct ggml_tensor * wo_a;
-    struct ggml_tensor * wo_b;
-
-    // normalization
-    struct ggml_tensor * ffn_norm_a;
-    struct ggml_tensor * ffn_norm_b;
-
-    // ff
-    struct ggml_tensor * ffn_gate_a;
-    struct ggml_tensor * ffn_gate_b;
-    struct ggml_tensor * ffn_down_a;
-    struct ggml_tensor * ffn_down_b;
-    struct ggml_tensor * ffn_up_a;
-    struct ggml_tensor * ffn_up_b;
-};
-
-struct my_llama_lora {
-    struct ggml_context * ctx = NULL;
-    ggml_backend_buffer_t data;
-
-    my_llama_lora_hparams hparams;
-
-    struct ggml_tensor * tok_embeddings_a;
-    struct ggml_tensor * tok_embeddings_b;
-
-    struct ggml_tensor * norm_a;
-    struct ggml_tensor * norm_b;
-    struct ggml_tensor * output_a;
-    struct ggml_tensor * output_b;
-
-    std::vector<my_llama_lora_layer> layers;
-};
-
-// gguf constants
-static const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA   = "finetune_lora";
-static const char * LLM_KV_TRAINING_TYPE                 = "training.type";
-
-static const char * LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD  = "training.lora.rank.token_embd";
-static const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm";
-static const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT      = "training.lora.rank.output";
-static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_NORM   = "training.lora.rank.attn_norm";
-static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_Q      = "training.lora.rank.attn_q";
-static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_K      = "training.lora.rank.attn_k";
-static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_V      = "training.lora.rank.attn_v";
-static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_OUT    = "training.lora.rank.attn_output";
-static const char * LLM_KV_TRAINING_LORA_RANK_FFN_NORM    = "training.lora.rank.ffn_norm";
-static const char * LLM_KV_TRAINING_LORA_RANK_FFN_GATE    = "training.lora.rank.ffn_gate";
-static const char * LLM_KV_TRAINING_LORA_RANK_FFN_DOWN    = "training.lora.rank.ffn_down";
-static const char * LLM_KV_TRAINING_LORA_RANK_FFN_UP      = "training.lora.rank.ffn_up";
-
-// gguf constants (sync with gguf.py)
-
-static const char * LLM_KV_GENERAL_ARCHITECTURE        = "general.architecture";
-static const char * LLM_KV_GENERAL_FILE_TYPE           = "general.file_type";
-
-static const char * LLM_KV_CONTEXT_LENGTH              = "%s.context_length";
-static const char * LLM_KV_EMBEDDING_LENGTH            = "%s.embedding_length";
-static const char * LLM_KV_BLOCK_COUNT                 = "%s.block_count";
-static const char * LLM_KV_FEED_FORWARD_LENGTH         = "%s.feed_forward_length";
-static const char * LLM_KV_ATTENTION_HEAD_COUNT        = "%s.attention.head_count";
-static const char * LLM_KV_ATTENTION_HEAD_COUNT_KV     = "%s.attention.head_count_kv";
-static const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon";
-static const char * LLM_KV_ROPE_DIMENSION_COUNT        = "%s.rope.dimension_count";
-static const char * LLM_KV_ROPE_FREQ_BASE              = "%s.rope.freq_base"; // TODO load in llama.cpp
-static const char * LLM_KV_ROPE_SCALE_LINEAR           = "%s.rope.scale_linear";
-
-static const char * LLM_TENSOR_TOKEN_EMBD    = "token_embd";
-static const char * LLM_TENSOR_OUTPUT_NORM   = "output_norm";
-static const char * LLM_TENSOR_OUTPUT        = "output";
-static const char * LLM_TENSOR_ATTN_NORM     = "blk.%d.attn_norm";
-static const char * LLM_TENSOR_ATTN_Q        = "blk.%d.attn_q";
-static const char * LLM_TENSOR_ATTN_K        = "blk.%d.attn_k";
-static const char * LLM_TENSOR_ATTN_V        = "blk.%d.attn_v";
-static const char * LLM_TENSOR_ATTN_OUT      = "blk.%d.attn_output";
-static const char * LLM_TENSOR_FFN_NORM      = "blk.%d.ffn_norm";
-static const char * LLM_TENSOR_FFN_GATE      = "blk.%d.ffn_gate";
-static const char * LLM_TENSOR_FFN_DOWN      = "blk.%d.ffn_down";
-static const char * LLM_TENSOR_FFN_UP        = "blk.%d.ffn_up";
-
-static void print_params(struct my_llama_hparams * params) {
-    printf("%s: n_vocab               : %u\n", __func__, params->n_vocab);
-    printf("%s: n_ctx                 : %u\n", __func__, params->n_ctx);
-    printf("%s: n_embd                : %u\n", __func__, params->n_embd);
-    printf("%s: n_ff                  : %u\n", __func__, params->n_ff);
-    printf("%s: n_head                : %u\n", __func__, params->n_head);
-    printf("%s: n_head_kv             : %u\n", __func__, params->n_head_kv);
-    printf("%s: n_layer               : %u\n", __func__, params->n_layer);
-    printf("%s: norm_rms_eps          : %f\n", __func__, params->f_norm_rms_eps);
-    printf("%s: rope_freq_base        : %f\n", __func__, params->rope_freq_base);
-    printf("%s: rope_freq_scale       : %f\n", __func__, params->rope_freq_scale);
-}
-
-static void print_lora_params(struct my_llama_lora_hparams * params) {
-    printf("%s: n_rank_attention_norm : %u\n", __func__, params->n_rank_attention_norm);
-    printf("%s: n_rank_wq             : %u\n", __func__, params->n_rank_wq);
-    printf("%s: n_rank_wk             : %u\n", __func__, params->n_rank_wk);
-    printf("%s: n_rank_wv             : %u\n", __func__, params->n_rank_wv);
-    printf("%s: n_rank_wo             : %u\n", __func__, params->n_rank_wo);
-    printf("%s: n_rank_ffn_norm       : %u\n", __func__, params->n_rank_ffn_norm);
-    printf("%s: n_rank_ffn_gate       : %u\n", __func__, params->n_rank_ffn_gate);
-    printf("%s: n_rank_ffn_down       : %u\n", __func__, params->n_rank_ffn_down);
-    printf("%s: n_rank_ffn_up         : %u\n", __func__, params->n_rank_ffn_up);
-    printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings);
-    printf("%s: n_rank_norm           : %u\n", __func__, params->n_rank_norm);
-    printf("%s: n_rank_output         : %u\n", __func__, params->n_rank_output);
-}
-
-#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
-{ \
-    const std::string skey(key); \
-    const int kid = gguf_find_key(ctx, skey.c_str()); \
-    if (kid >= 0) { \
-        enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
-        if (ktype != (type)) { \
-            die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \
-        } \
-        (dst) = func(ctx, kid); \
-    } else if (req) { \
-        die_fmt("key not found in model: %s", skey.c_str()); \
-    } \
-}
-
-static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_hparams * hparams, const char * expected_arch) {
-    std::string arch;
-
-    GGUF_GET_KEY(ctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE);
-    if (expected_arch != NULL) {
-        if (arch != expected_arch) {
-            printf("%s: arch=%s expected_arch=%s\n", __func__, arch.c_str(), expected_arch);
-        }
-        GGML_ASSERT(arch == expected_arch);
-    }
-
-    std::vector<char> keybuf;
-    keybuf.resize(512);
-    auto kv = [&arch, &keybuf](const char * key) -> const char * {
-        snprintf(keybuf.data(), keybuf.size(), key, arch.c_str());
-        return keybuf.data();
-    };
-
-    GGUF_GET_KEY(ctx, hparams->n_embd,         gguf_get_val_u32, GGUF_TYPE_UINT32,  true, kv(LLM_KV_EMBEDDING_LENGTH));
-    GGUF_GET_KEY(ctx, hparams->n_ctx,          gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH));
-    GGUF_GET_KEY(ctx, hparams->n_ff,           gguf_get_val_u32, GGUF_TYPE_UINT32,  true, kv(LLM_KV_FEED_FORWARD_LENGTH));
-    GGUF_GET_KEY(ctx, hparams->n_head,         gguf_get_val_u32, GGUF_TYPE_UINT32,  true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
-    GGUF_GET_KEY(ctx, hparams->n_layer,        gguf_get_val_u32, GGUF_TYPE_UINT32,  true, kv(LLM_KV_BLOCK_COUNT));
-
-    // n_head_kv is optional, default to n_head
-    hparams->n_head_kv = hparams->n_head;
-    GGUF_GET_KEY(ctx, hparams->n_head_kv,      gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
-
-    float rope_freq_scale = 1.0f;
-    GGUF_GET_KEY(ctx, hparams->f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
-    GGUF_GET_KEY(ctx, hparams->rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
-    GGUF_GET_KEY(ctx, rope_freq_scale,         gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
-    if (rope_freq_scale != 1.0f) {
-        hparams->rope_freq_scale = 1.0f / rope_freq_scale;
-    }
-}
-
-static void init_model(struct llama_model * input, struct my_llama_model * model, const char * fn_model, uint32_t n_ctx) {
-    auto & hparams = model->hparams;
-
-    std::vector<char> tn_buf;
-    tn_buf.resize(GGML_MAX_NAME);
-    auto tn = [&tn_buf](const char * key) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key);
-        return tn_buf.data();
-    };
-    auto tni = [&tn_buf](const char * key, int bid) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), key, bid);
-        std::string s = tn_buf.data();
-        snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str());
-        return tn_buf.data();
-    };
-
-
-    // get parameters directly from gguf file
-    {
-        struct gguf_init_params params = {
-            /*.no_alloc = */ false,
-            /*.ctx      = */ NULL,
-        };
-        struct gguf_context * mctx = gguf_init_from_file(fn_model, params);
-
-        load_model_hparams_gguf(mctx, &hparams, "llama");
-
-        gguf_free(mctx);
-    }
-    hparams.n_vocab = llama_n_vocab(input);
-    hparams.n_ctx = n_ctx;
-
-    // get tensors from llama_model (possibly mmapped)
-    model->tok_embeddings = llama_get_model_tensor(input, tn(LLM_TENSOR_TOKEN_EMBD));
-    model->norm           = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT_NORM));
-    model->output         = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT));
-
-    assert_shape_2d(model->tok_embeddings, hparams.n_embd, hparams.n_vocab);
-    assert_shape_1d(model->norm,           hparams.n_embd);
-    assert_shape_2d(model->output,         hparams.n_embd, hparams.n_vocab);
-
-    model->layers.resize(hparams.n_layer);
-    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        layer.attention_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_NORM, i));
-        layer.wq             = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_Q, i));
-        layer.wk             = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_K, i));
-        layer.wv             = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_V, i));
-        layer.wo             = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_OUT, i));
-        layer.ffn_norm       = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_NORM, i));
-        layer.ffn_gate       = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i));
-        layer.ffn_down       = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i));
-        layer.ffn_up         = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i));
-
-        assert_shape_1d(layer.attention_norm, hparams.n_embd);
-        assert_shape_2d(layer.wq,             hparams.n_embd, hparams.n_embd);
-        assert_shape_2d(layer.wk,             hparams.n_embd, hparams.n_embd_gqa());
-        assert_shape_2d(layer.wv,             hparams.n_embd, hparams.n_embd_gqa());
-        assert_shape_2d(layer.wo,             hparams.n_embd, hparams.n_embd);
-        assert_shape_1d(layer.ffn_norm,       hparams.n_embd);
-        assert_shape_2d(layer.ffn_gate,       hparams.n_embd, hparams.n_ff);
-        assert_shape_2d(layer.ffn_down,       hparams.n_ff,   hparams.n_embd);
-        assert_shape_2d(layer.ffn_up,         hparams.n_embd, hparams.n_ff);
-    }
-}
-
-static void set_param_lora(struct my_llama_lora * lora) {
-    const uint32_t n_layer = lora->layers.size();
-
-    struct ggml_context* ctx = lora->ctx;
-
-    ggml_set_param(ctx, lora->tok_embeddings_a);
-    ggml_set_param(ctx, lora->tok_embeddings_b);
-    ggml_set_param(ctx, lora->norm_a);
-    ggml_set_param(ctx, lora->norm_b);
-    ggml_set_param(ctx, lora->output_a);
-    ggml_set_param(ctx, lora->output_b);
-
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = lora->layers[i];
-
-        ggml_set_param(ctx, layer.attention_norm_a);
-        ggml_set_param(ctx, layer.attention_norm_b);
-        ggml_set_param(ctx, layer.wq_a);
-        ggml_set_param(ctx, layer.wq_b);
-        ggml_set_param(ctx, layer.wk_a);
-        ggml_set_param(ctx, layer.wk_b);
-        ggml_set_param(ctx, layer.wv_a);
-        ggml_set_param(ctx, layer.wv_b);
-        ggml_set_param(ctx, layer.wo_a);
-        ggml_set_param(ctx, layer.wo_b);
-        ggml_set_param(ctx, layer.ffn_norm_a);
-        ggml_set_param(ctx, layer.ffn_norm_b);
-        ggml_set_param(ctx, layer.ffn_gate_a);
-        ggml_set_param(ctx, layer.ffn_gate_b);
-        ggml_set_param(ctx, layer.ffn_down_a);
-        ggml_set_param(ctx, layer.ffn_down_b);
-        ggml_set_param(ctx, layer.ffn_up_a);
-        ggml_set_param(ctx, layer.ffn_up_b);
-    }
-}
-
-static void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) {
-    const auto & lparams = lora->hparams;
-
-    const uint32_t n_embd     = model->hparams.n_embd;
-    const uint32_t n_embd_gqa = model->hparams.n_embd_gqa();
-    const uint32_t n_layer    = model->hparams.n_layer;
-    const uint32_t n_vocab    = model->hparams.n_vocab;
-    const uint32_t n_ff       = model->hparams.n_ff;
-
-    std::vector<char> tn_buf;
-    tn_buf.resize(GGML_MAX_NAME);
-    auto tn = [&tn_buf](const char * key, const char * suffix) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), "%s%s", key, suffix);
-        return tn_buf.data();
-    };
-    auto tni = [&tn_buf](const char * key, const char * suffix, int bid) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), key, bid);
-        std::string s = tn_buf.data();
-        snprintf(tn_buf.data(), tn_buf.size(), "%s%s", s.c_str(), suffix);
-        return tn_buf.data();
-    };
-
-    // context for lora tensors without their data
-    struct ggml_init_params ctx_lora_params;
-    ctx_lora_params.mem_size   = ggml_tensor_overhead()*2*(6 + n_layer*18);
-    ctx_lora_params.mem_buffer = NULL;
-    ctx_lora_params.no_alloc   = true;
-
-    struct ggml_context * ctx = ggml_init(ctx_lora_params);
-    lora->ctx = ctx;
-
-    lora->tok_embeddings_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_tok_embeddings, n_embd);
-    lora->tok_embeddings_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_tok_embeddings, n_vocab);
-    lora->norm_a           = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_norm, n_embd);
-    lora->norm_b           = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_norm, 1);
-    lora->output_a         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_output, n_embd);
-    lora->output_b         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_output, n_vocab);
-
-    ggml_set_name(lora->tok_embeddings_a, tn(LLM_TENSOR_TOKEN_EMBD,  ".weight.lora_a"));
-    ggml_set_name(lora->tok_embeddings_b, tn(LLM_TENSOR_TOKEN_EMBD,  ".weight.lora_b"));
-    ggml_set_name(lora->norm_a,           tn(LLM_TENSOR_OUTPUT_NORM, ".weight.lora_a"));
-    ggml_set_name(lora->norm_b,           tn(LLM_TENSOR_OUTPUT_NORM, ".weight.lora_b"));
-    ggml_set_name(lora->output_a,         tn(LLM_TENSOR_OUTPUT,      ".weight.lora_a"));
-    ggml_set_name(lora->output_b,         tn(LLM_TENSOR_OUTPUT,      ".weight.lora_b"));
-
-    lora->layers.resize(n_layer);
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = lora->layers[i];
-
-        layer.attention_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_attention_norm, n_embd);
-        layer.attention_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_attention_norm, 1);
-
-        layer.wq_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wq, n_embd);
-        layer.wq_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wq, n_embd);
-        layer.wk_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wk, n_embd);
-        layer.wk_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wk, n_embd_gqa);
-        layer.wv_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wv, n_embd);
-        layer.wv_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wv, n_embd_gqa);
-        layer.wo_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wo, n_embd);
-        layer.wo_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wo, n_embd);
-
-        layer.ffn_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, n_embd);
-        layer.ffn_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, 1);
-
-        layer.ffn_gate_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_embd);
-        layer.ffn_gate_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_ff);
-        layer.ffn_down_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_ff);
-        layer.ffn_down_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_embd);
-        layer.ffn_up_a   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up,   n_embd);
-        layer.ffn_up_b   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up,   n_ff);
-
-        ggml_set_name(layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_a", i));
-        ggml_set_name(layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_b", i));
-        ggml_set_name(layer.wq_a,             tni(LLM_TENSOR_ATTN_Q,    ".weight.lora_a", i));
-        ggml_set_name(layer.wq_b,             tni(LLM_TENSOR_ATTN_Q,    ".weight.lora_b", i));
-        ggml_set_name(layer.wk_a,             tni(LLM_TENSOR_ATTN_K,    ".weight.lora_a", i));
-        ggml_set_name(layer.wk_b,             tni(LLM_TENSOR_ATTN_K,    ".weight.lora_b", i));
-        ggml_set_name(layer.wv_a,             tni(LLM_TENSOR_ATTN_V,    ".weight.lora_a", i));
-        ggml_set_name(layer.wv_b,             tni(LLM_TENSOR_ATTN_V,    ".weight.lora_b", i));
-        ggml_set_name(layer.wo_a,             tni(LLM_TENSOR_ATTN_OUT,  ".weight.lora_a", i));
-        ggml_set_name(layer.wo_b,             tni(LLM_TENSOR_ATTN_OUT,  ".weight.lora_b", i));
-        ggml_set_name(layer.ffn_norm_a,       tni(LLM_TENSOR_FFN_NORM,  ".weight.lora_a", i));
-        ggml_set_name(layer.ffn_norm_b,       tni(LLM_TENSOR_FFN_NORM,  ".weight.lora_b", i));
-        ggml_set_name(layer.ffn_gate_a,       tni(LLM_TENSOR_FFN_GATE,  ".weight.lora_a", i));
-        ggml_set_name(layer.ffn_gate_b,       tni(LLM_TENSOR_FFN_GATE,  ".weight.lora_b", i));
-        ggml_set_name(layer.ffn_down_a,       tni(LLM_TENSOR_FFN_DOWN,  ".weight.lora_a", i));
-        ggml_set_name(layer.ffn_down_b,       tni(LLM_TENSOR_FFN_DOWN,  ".weight.lora_b", i));
-        ggml_set_name(layer.ffn_up_a,         tni(LLM_TENSOR_FFN_UP,    ".weight.lora_a", i));
-        ggml_set_name(layer.ffn_up_b,         tni(LLM_TENSOR_FFN_UP,    ".weight.lora_b", i));
-    }
-
-    set_param_lora(lora);
-
-    // allocate data for lora tensors
-    lora->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
-}
-
-static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) {
-    const uint32_t n_layer = lora->layers.size();
-
-    struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
-
-    randomize_tensor_normal(lora->tok_embeddings_a, rnd);
-    ggml_set_zero(lora->tok_embeddings_b);
-    randomize_tensor_normal(lora->norm_a,           rnd);
-    ggml_set_zero(lora->norm_b);
-    randomize_tensor_normal(lora->output_a,         rnd);
-    ggml_set_zero(lora->output_b);
-
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = lora->layers[i];
-        randomize_tensor_normal(layer.attention_norm_a, rnd);
-        ggml_set_zero(layer.attention_norm_b);
-
-        randomize_tensor_normal(layer.wq_a, rnd);
-        ggml_set_zero(layer.wq_b);
-        randomize_tensor_normal(layer.wk_a, rnd);
-        ggml_set_zero(layer.wk_b);
-        randomize_tensor_normal(layer.wv_a, rnd);
-        ggml_set_zero(layer.wv_b);
-        randomize_tensor_normal(layer.wo_a, rnd);
-        ggml_set_zero(layer.wo_b);
-
-        randomize_tensor_normal(layer.ffn_norm_a, rnd);
-        ggml_set_zero(layer.ffn_norm_b);
-
-        randomize_tensor_normal(layer.ffn_gate_a, rnd);
-        ggml_set_zero(layer.ffn_gate_b);
-        randomize_tensor_normal(layer.ffn_down_a, rnd);
-        ggml_set_zero(layer.ffn_down_b);
-        randomize_tensor_normal(layer.ffn_up_a, rnd);
-        ggml_set_zero(layer.ffn_up_b);
-    }
-
-    free_random_normal_distribution(rnd);
-}
-
-static struct ggml_tensor * llama_build_lora_finetune_graphs(
-        struct my_llama_model * model,
-        struct my_llama_lora  * lora,
-        ggml_gallocr_t          alloc,
-        struct ggml_context   * ctx,
-        struct ggml_cgraph    * gf,
-        struct ggml_cgraph    * gb,
-        struct ggml_cgraph    * gb_tmp,
-        struct ggml_tensor  * * logits,
-        struct ggml_tensor    * tokens_input,
-        struct ggml_tensor    * targets,
-        const  int              n_tokens,
-        const  int              n_batch,
-        const  bool             enable_flash_attn,
-        const  bool             enable_checkpointing,
-        const  bool             measure_only) {
-
-    ggml_set_scratch(ctx, { 0, 0, nullptr, });
-    const int n_past = 0;
-    const int N = n_tokens;
-    const auto & hparams  = model->hparams;
-    const int n_ctx       = hparams.n_ctx;
-    const int n_vocab     = hparams.n_vocab;
-    const int n_embd      = hparams.n_embd;
-    const int n_layer     = hparams.n_layer;
-    const int n_head      = hparams.n_head;
-    const int n_head_kv   = hparams.n_head_kv;
-    const int n_ff        = hparams.n_ff;
-    const int n_rot       = hparams.n_embd_head();
-    const int n_embd_head = hparams.n_embd_head();
-    const int n_embd_gqa  = hparams.n_embd_gqa();
-
-    const float rms_norm_eps    = hparams.f_norm_rms_eps;
-    const float rope_freq_base  = hparams.rope_freq_base;
-    const float rope_freq_scale = hparams.rope_freq_scale;
-
-    GGML_ASSERT((size_t) n_layer == lora->layers.size());
-
-    auto set_name = [](struct ggml_tensor * t, const char * n) {
-        ggml_set_name(t, n);
-        if (t->grad) {
-            ggml_format_name(t->grad, "%s->grad", n);
-        }
-    };
-
-    // KQ_pos - contains the positions
-    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
-    ggml_set_input(KQ_pos);
-
-    // rope has so much parameters that we make a custom function for it
-    auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
-                (struct ggml_tensor * t) -> struct ggml_tensor * {
-        // not capturing these, to silcence warnings
-        const int rope_mode = 0;
-
-        return ggml_rope_ext(ctx,
-            t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx,
-            rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
-        );
-    };
-
-    set_name(tokens_input, "tokens_input");
-    set_name(targets,      "targets");
-
-    GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
-
-    auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
-        if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16) {
-            return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
-        } else if (a->type == GGML_TYPE_F32) {
-            return ggml_add(ctx, a, b);
-        } else {
-            die_fmt("%s: Finetuning on tensors with type '%s' is not yet supported.\n",
-                __func__, ggml_type_name(a->type));
-        }
-    };
-
-    struct ggml_tensor * tok_embeddings = add_to_f32(ctx, model->tok_embeddings, ggml_mul_mat(ctx, lora->tok_embeddings_a, lora->tok_embeddings_b));
-    struct ggml_tensor * norm           = add_to_f32(ctx, model->norm, ggml_mul_mat(ctx, lora->norm_a, lora->norm_b));
-    struct ggml_tensor * output         = add_to_f32(ctx, model->output, ggml_mul_mat(ctx, lora->output_a, lora->output_b));
-
-    struct ggml_tensor * t00 = ggml_reshape_1d(ctx, tokens_input, N*n_batch);  set_name(t00, "t00"); assert_shape_1d(t00, N*n_batch);
-    struct ggml_tensor * t01 = ggml_get_rows(ctx, tok_embeddings, t00);        set_name(t01, "t01"); assert_shape_2d(t01, n_embd, N*n_batch);
-
-    struct ggml_tensor * cur = t01;
-
-    std::vector<struct ggml_tensor *> checkpoints;
-    if (enable_checkpointing) {
-        checkpoints.push_back(tokens_input);
-        checkpoints.push_back(targets);
-        checkpoints.push_back(t00);
-        checkpoints.push_back(t01);
-    }
-
-    const float kv_scale = 1.0f/sqrtf(float(n_embd)/n_head);
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct my_llama_layer & layer = model->layers[il];
-        struct my_llama_lora_layer & llayer = lora->layers[il];
-
-        struct ggml_tensor * attention_norm = add_to_f32(ctx, layer.attention_norm, ggml_mul_mat(ctx, llayer.attention_norm_a, llayer.attention_norm_b));
-        struct ggml_tensor * ffn_norm = add_to_f32(ctx, layer.ffn_norm, ggml_mul_mat(ctx, llayer.ffn_norm_a, llayer.ffn_norm_b));
-        struct ggml_tensor * wq       = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b));
-        struct ggml_tensor * wk       = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b));
-        struct ggml_tensor * wv       = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b));
-        struct ggml_tensor * wo       = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b));
-        struct ggml_tensor * ffn_gate = add_to_f32(ctx, layer.ffn_gate, ggml_mul_mat(ctx, llayer.ffn_gate_a, llayer.ffn_gate_b));
-        struct ggml_tensor * ffn_down = add_to_f32(ctx, layer.ffn_down, ggml_mul_mat(ctx, llayer.ffn_down_a, llayer.ffn_down_b));
-        struct ggml_tensor * ffn_up   = add_to_f32(ctx, layer.ffn_up, ggml_mul_mat(ctx, llayer.ffn_up_a, llayer.ffn_up_b));
-
-        struct ggml_tensor * t02 = ggml_rms_norm     (ctx, cur, rms_norm_eps);                       set_name(t02, "t02");     assert_shape_2d(t02, n_embd, N*n_batch);
-        struct ggml_tensor * t03 = ggml_repeat       (ctx, attention_norm, t02);                     set_name(t03, "t03");     assert_shape_2d(t03, n_embd, N*n_batch);
-        struct ggml_tensor * t04 = ggml_mul          (ctx, t03, t02);                                set_name(t04, "t04");     assert_shape_2d(t04, n_embd, N*n_batch);
-        struct ggml_tensor * t05 = ggml_mul_mat      (ctx, wq, t04);                                 set_name(t05, "t05");     assert_shape_2d(t05, n_embd, N*n_batch);
-        struct ggml_tensor * t06 = ggml_reshape_4d   (ctx, t05, n_embd_head, n_head, N, n_batch);    set_name(t06, "t06");     assert_shape_4d(t06, n_embd_head, n_head, N, n_batch);
-        struct ggml_tensor * t07 = rope              (t06);                                          set_name(t07, "t07");     assert_shape_4d(t07, n_embd_head, n_head, N, n_batch);
-        struct ggml_tensor * t08 = ggml_mul_mat      (ctx, wk, t04);                                 set_name(t08, "t08");     assert_shape_2d(t08, n_embd_gqa, N*n_batch);
-        struct ggml_tensor * t09 = ggml_reshape_4d   (ctx, t08, n_embd_head, n_head_kv, N, n_batch); set_name(t09, "t09");     assert_shape_4d(t09, n_embd_head, n_head_kv, N, n_batch);
-        struct ggml_tensor * t10 = rope              (t09);                                          set_name(t10, "t10");     assert_shape_4d(t10, n_embd_head, n_head_kv, N, n_batch);
-
-        struct ggml_tensor * t11;
-        if (ggml_is_quantized(wv->type)) {
-            struct ggml_tensor * t11_1 = ggml_mul_mat  (ctx, wv, t04);                               set_name(t11_1, "t11_1"); assert_shape_2d(t11_1, n_embd_gqa, N*n_batch);
-            struct ggml_tensor * t11_2 = ggml_transpose(ctx, t11_1);                                 set_name(t11_2, "t11_2"); assert_shape_2d(t11_2, N*n_batch, n_embd_gqa);
-                                 t11   = ggml_cont     (ctx, t11_2);                                 set_name(t11, "t11");     assert_shape_2d(t11, N*n_batch, n_embd_gqa);
-        } else {
-                                 t11   = ggml_mul_mat  (ctx, t04, wv);                               set_name(t11, "t11");     assert_shape_2d(t11, N*n_batch, n_embd_gqa);
-        }
-
-        struct ggml_tensor * t12 = ggml_reshape_4d   (ctx, t11, N, n_batch, n_embd_head, n_head_kv); set_name(t12, "t12");     assert_shape_4d(t12, N, n_batch, n_embd_head, n_head_kv);
-        struct ggml_tensor * t13 = ggml_permute      (ctx, t07, 0, 2, 1, 3);                         set_name(t13, "t13");     assert_shape_4d(t13, n_embd_head, N, n_head, n_batch);
-        struct ggml_tensor * t14 = ggml_permute      (ctx, t10, 0, 2, 1, 3);                         set_name(t14, "t14");     assert_shape_4d(t14, n_embd_head, N, n_head_kv, n_batch);
-        struct ggml_tensor * t15 = ggml_permute      (ctx, t12, 0, 3, 1, 2);                         set_name(t15, "t15");     assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch);
-        struct ggml_tensor * t16;
-        if (enable_flash_attn) {
-            GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
-            //t16 = ggml_flash_attn(ctx, t13, t14, t15, true);                                         set_name(t16, "t16");     assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
-        } else {
-            struct ggml_tensor * t16_0 = ggml_mul_mat              (ctx, t14, t13);                  set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
-            struct ggml_tensor * t16_1 = ggml_scale_inplace        (ctx, t16_0, kv_scale);           set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
-            struct ggml_tensor * t16_2 = ggml_diag_mask_inf_inplace(ctx, t16_1, n_past);             set_name(t16_2, "t16_2"); assert_shape_4d(t16_2, N, N, n_head, n_batch);
-            struct ggml_tensor * t16_3 = ggml_soft_max_inplace     (ctx, t16_2);                     set_name(t16_3, "t16_3"); assert_shape_4d(t16_3, N, N, n_head, n_batch);
-            t16 = ggml_mul_mat(ctx, t15, t16_3);                                                     set_name(t16, "t16");     assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
-        }
-        struct ggml_tensor * t17 = ggml_permute      (ctx, t16, 0, 2, 1, 3);                         set_name(t17, "t17");     assert_shape_4d(t17, n_embd_head, n_head, N, n_batch);
-        struct ggml_tensor * t18 = ggml_cont         (ctx, t17);                                     set_name(t18, "t18");     assert_shape_4d(t18, n_embd_head, n_head, N, n_batch);
-        struct ggml_tensor * t19 = ggml_reshape_2d   (ctx, t18, n_embd, N*n_batch);                  set_name(t19, "t19");     assert_shape_2d(t19, n_embd, N*n_batch);
-        struct ggml_tensor * t20 = ggml_mul_mat      (ctx, wo, t19);                                 set_name(t20, "t20");     assert_shape_2d(t20, n_embd, N*n_batch);
-        struct ggml_tensor * t21 = ggml_add          (ctx, t20, cur);                                set_name(t21, "t21");     assert_shape_2d(t21, n_embd, N*n_batch);
-        struct ggml_tensor * t22 = ggml_rms_norm     (ctx, t21, rms_norm_eps);                       set_name(t22, "t22");     assert_shape_2d(t22, n_embd, N*n_batch);
-        struct ggml_tensor * t23 = ggml_repeat       (ctx, ffn_norm, t22);                           set_name(t23, "t23");     assert_shape_2d(t23, n_embd, N*n_batch);
-        struct ggml_tensor * t24 = ggml_mul          (ctx, t23, t22);                                set_name(t24, "t24");     assert_shape_2d(t24, n_embd, N*n_batch);
-        struct ggml_tensor * t25 = ggml_mul_mat      (ctx, ffn_up, t24);                             set_name(t25, "t25");     assert_shape_2d(t25, n_ff, N*n_batch);
-        struct ggml_tensor * t26 = ggml_mul_mat      (ctx, ffn_gate, t24);                           set_name(t26, "t26");     assert_shape_2d(t26, n_ff, N*n_batch);
-        struct ggml_tensor * t27 = ggml_silu         (ctx, t26);                                     set_name(t27, "t27");     assert_shape_2d(t27, n_ff, N*n_batch);
-        struct ggml_tensor * t28 = ggml_mul          (ctx, t27, t25);                                set_name(t28, "t28");     assert_shape_2d(t28, n_ff, N*n_batch);
-        struct ggml_tensor * t29 = ggml_mul_mat      (ctx, ffn_down, t28);                           set_name(t29, "t29");     assert_shape_2d(t29, n_embd, N*n_batch);
-        struct ggml_tensor * t30 = ggml_add          (ctx, t29, t21);                                set_name(t30, "t30");     assert_shape_2d(t30, n_embd, N*n_batch);
-        cur = t30;
-        if (enable_checkpointing) {
-            checkpoints.push_back(cur);
-        }
-    }
-    struct ggml_tensor * t31   = ggml_rms_norm          (ctx, cur, rms_norm_eps);                    set_name(t31, "t31");     assert_shape_2d(t31, n_embd, N*n_batch);
-    struct ggml_tensor * t32   = ggml_repeat            (ctx, norm, t31);                            set_name(t32, "t32");     assert_shape_2d(t32, n_embd, N*n_batch);
-    struct ggml_tensor * t33   = ggml_mul               (ctx, t32, t31);                             set_name(t33, "t33");     assert_shape_2d(t33, n_embd, N*n_batch);
-    struct ggml_tensor * t34   = ggml_mul_mat           (ctx, output, t33);                          set_name(t34, "t34");     assert_shape_2d(t34, n_vocab, N*n_batch);
-    struct ggml_tensor * t35   = ggml_reshape_3d        (ctx, t34, n_vocab, N, n_batch);             set_name(t35, "t35");     assert_shape_3d(t35, n_vocab, N, n_batch);
-    struct ggml_tensor * t36   = ggml_cross_entropy_loss(ctx, t35, targets);                         set_name(t36, "t36");     assert_shape_1d(t36, 1);
-
-    if (enable_checkpointing) {
-        checkpoints.push_back(t31);
-        checkpoints.push_back(t32);
-        checkpoints.push_back(t33);
-        checkpoints.push_back(t34);
-        checkpoints.push_back(t35);
-        checkpoints.push_back(t36);
-    }
-
-    ggml_build_forward_expand(gf, t36);
-
-    if (enable_checkpointing) {
-        ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size());
-    } else {
-        ggml_graph_cpy(gf, gb);
-        ggml_build_backward_expand(ctx, gf, gb, true);
-    }
-
-    GGML_ASSERT(alloc != NULL);
-
-    // make sure some tensors are not reallocated by inserting new temporary nodes depending on them
-    int n_leafs_before = gb->n_leafs;
-    int n_nodes_before = gb->n_nodes;
-
-    // output tensors
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, 1.0f));
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, 1.0f));
-    // input gradient
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f));
-    GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
-    ggml_set_input(t36->grad);
-    // KQ_pos
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));
-
-    // make sure base model tensors data cannot be used in viewable operations
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, 1.0f));
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, 1.0f));
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, 1.0f));
-    for (int il = 0; il < n_layer; ++il) {
-        struct my_llama_layer & layer = model->layers[il];
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_gate, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_down, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_up, 1.0f));
-    }
-
-    // allocating checkpoints in one block to reduce memory fragmentation
-    // note: they will be freed in reverse order
-    for (unsigned int i = 0; i < checkpoints.size(); ++i) {
-        if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) {
-            ggml_set_input(checkpoints[i]);
-        }
-    }
-
-    if (measure_only) {
-        ggml_gallocr_reserve(alloc, gb);
-    } else {
-        ggml_gallocr_alloc_graph(alloc, gb);
-
-        // set KQ_pos
-        {
-            int * data = (int *) KQ_pos->data;
-            for (int i = 0; i < N; ++i) {
-                data[i] = n_past + i;
-            }
-        }
-    }
-
-    // remove the additional nodes and leafs
-    for (int i = n_leafs_before; i < gb->n_leafs; ++i) {
-        gb->leafs[i] = NULL;
-    }
-    for (int i = n_nodes_before; i < gb->n_nodes; ++i) {
-        gb->nodes[i] = NULL;
-    }
-    gb->n_leafs = n_leafs_before;
-    gb->n_nodes = n_nodes_before;
-
-    *logits = t35;
-    return t36;
-}
-
-static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora) {
-    // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read
-
-    std::string arch;
-
-    std::vector<char> keybuf;
-    keybuf.resize(512);
-
-    GGUF_GET_KEY(fctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE);
-    GGML_ASSERT(arch == "llama");
-
-    uint32_t ftype_u;
-    GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_GENERAL_FILE_TYPE);
-    GGML_ASSERT((enum llama_ftype) ftype_u == LLAMA_FTYPE_ALL_F32);
-
-    struct my_llama_hparams hparams;
-    load_model_hparams_gguf(fctx, &hparams, arch.c_str());
-
-    // parameters that define tensor shapes must match
-    GGML_ASSERT(hparams.n_embd    == model->hparams.n_embd);
-    GGML_ASSERT(hparams.n_ff      == model->hparams.n_ff);
-    GGML_ASSERT(hparams.n_head    == model->hparams.n_head);
-    GGML_ASSERT(hparams.n_head_kv == model->hparams.n_head_kv);
-    GGML_ASSERT(hparams.n_layer   == model->hparams.n_layer);
-
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_tok_embeddings, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_norm,           gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_output,         gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_OUTPUT);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_attention_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_NORM);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_wq,             gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_Q);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_wk,             gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_K);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv,             gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo,             gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm,       gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_gate,       gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_down,       gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_up,         gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP);
-
-    init_lora(model, lora);
-
-    copy_tensor_by_name(lora->tok_embeddings_a, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_a));
-    copy_tensor_by_name(lora->tok_embeddings_b, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_b));
-    copy_tensor_by_name(lora->norm_a,           f_ggml_ctx, ggml_get_name(lora->norm_a));
-    copy_tensor_by_name(lora->norm_b,           f_ggml_ctx, ggml_get_name(lora->norm_b));
-    copy_tensor_by_name(lora->output_a,         f_ggml_ctx, ggml_get_name(lora->output_a));
-    copy_tensor_by_name(lora->output_b,         f_ggml_ctx, ggml_get_name(lora->output_b));
-
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
-        copy_tensor_by_name(layer.attention_norm_a, f_ggml_ctx, ggml_get_name(layer.attention_norm_a));
-        copy_tensor_by_name(layer.attention_norm_b, f_ggml_ctx, ggml_get_name(layer.attention_norm_b));
-        copy_tensor_by_name(layer.wq_a,             f_ggml_ctx, ggml_get_name(layer.wq_a));
-        copy_tensor_by_name(layer.wq_b,             f_ggml_ctx, ggml_get_name(layer.wq_b));
-        copy_tensor_by_name(layer.wk_a,             f_ggml_ctx, ggml_get_name(layer.wk_a));
-        copy_tensor_by_name(layer.wk_b,             f_ggml_ctx, ggml_get_name(layer.wk_b));
-        copy_tensor_by_name(layer.wv_a,             f_ggml_ctx, ggml_get_name(layer.wv_a));
-        copy_tensor_by_name(layer.wv_b,             f_ggml_ctx, ggml_get_name(layer.wv_b));
-        copy_tensor_by_name(layer.wo_a,             f_ggml_ctx, ggml_get_name(layer.wo_a));
-        copy_tensor_by_name(layer.wo_b,             f_ggml_ctx, ggml_get_name(layer.wo_b));
-        copy_tensor_by_name(layer.ffn_norm_a,       f_ggml_ctx, ggml_get_name(layer.ffn_norm_a));
-        copy_tensor_by_name(layer.ffn_norm_b,       f_ggml_ctx, ggml_get_name(layer.ffn_norm_b));
-        copy_tensor_by_name(layer.ffn_gate_a,       f_ggml_ctx, ggml_get_name(layer.ffn_gate_a));
-        copy_tensor_by_name(layer.ffn_gate_b,       f_ggml_ctx, ggml_get_name(layer.ffn_gate_b));
-        copy_tensor_by_name(layer.ffn_down_a,       f_ggml_ctx, ggml_get_name(layer.ffn_down_a));
-        copy_tensor_by_name(layer.ffn_down_b,       f_ggml_ctx, ggml_get_name(layer.ffn_down_b));
-        copy_tensor_by_name(layer.ffn_up_a,         f_ggml_ctx, ggml_get_name(layer.ffn_up_a));
-        copy_tensor_by_name(layer.ffn_up_b,         f_ggml_ctx, ggml_get_name(layer.ffn_up_b));
-    }
-}
-
-static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora) {
-    const char * arch = "llama";
-    enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
-
-    std::vector<char> keybuf;
-    keybuf.resize(512);
-    auto kv = [arch, &keybuf](const char * key) -> const char * {
-        snprintf(keybuf.data(), keybuf.size(), key, arch);
-        return keybuf.data();
-    };
-
-    gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch);
-    gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype);
-
-    gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH),              model->hparams.n_ctx);
-    gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH),            model->hparams.n_embd);
-    gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH),         model->hparams.n_ff);
-    gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT),        model->hparams.n_head);
-    gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV),     model->hparams.n_head_kv);
-    gguf_set_val_u32(fctx, kv(LLM_KV_BLOCK_COUNT),                 model->hparams.n_layer);
-    gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_DIMENSION_COUNT),        model->hparams.n_embd_head());
-    gguf_set_val_f32(fctx, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS), model->hparams.f_norm_rms_eps);
-    gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_FREQ_BASE),              model->hparams.rope_freq_base);
-    gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_SCALE_LINEAR),           model->hparams.rope_freq_scale);
-
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD,   lora->hparams.n_rank_tok_embeddings);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM,  lora->hparams.n_rank_norm);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_OUTPUT,       lora->hparams.n_rank_output);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_NORM,    lora->hparams.n_rank_attention_norm);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_Q,       lora->hparams.n_rank_wq);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_K,       lora->hparams.n_rank_wk);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_V,       lora->hparams.n_rank_wv);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT,     lora->hparams.n_rank_wo);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM,     lora->hparams.n_rank_ffn_norm);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE,     lora->hparams.n_rank_ffn_gate);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN,     lora->hparams.n_rank_ffn_down);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP,       lora->hparams.n_rank_ffn_up);
-
-    gguf_add_tensor(fctx, lora->tok_embeddings_a);
-    gguf_add_tensor(fctx, lora->tok_embeddings_b);
-    gguf_add_tensor(fctx, lora->norm_a);
-    gguf_add_tensor(fctx, lora->norm_b);
-    gguf_add_tensor(fctx, lora->output_a);
-    gguf_add_tensor(fctx, lora->output_b);
-
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
-
-        gguf_add_tensor(fctx, layer.attention_norm_a);
-        gguf_add_tensor(fctx, layer.attention_norm_b);
-        gguf_add_tensor(fctx, layer.wq_a);
-        gguf_add_tensor(fctx, layer.wq_b);
-        gguf_add_tensor(fctx, layer.wk_a);
-        gguf_add_tensor(fctx, layer.wk_b);
-        gguf_add_tensor(fctx, layer.wv_a);
-        gguf_add_tensor(fctx, layer.wv_b);
-        gguf_add_tensor(fctx, layer.wo_a);
-        gguf_add_tensor(fctx, layer.wo_b);
-        gguf_add_tensor(fctx, layer.ffn_norm_a);
-        gguf_add_tensor(fctx, layer.ffn_norm_b);
-        gguf_add_tensor(fctx, layer.ffn_gate_a);
-        gguf_add_tensor(fctx, layer.ffn_gate_b);
-        gguf_add_tensor(fctx, layer.ffn_down_a);
-        gguf_add_tensor(fctx, layer.ffn_down_b);
-        gguf_add_tensor(fctx, layer.ffn_up_a);
-        gguf_add_tensor(fctx, layer.ffn_up_b);
-    }
-}
-
-static void load_checkpoint_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) {
-    std::string train_type = LLM_KV_TRAINING_TYPE_FINETUNE_LORA;
-    GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE);
-    GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_FINETUNE_LORA);
-
-    load_train_state_gguf(fctx, f_ggml_ctx, train);
-    load_llama_lora_gguf(fctx, f_ggml_ctx, model, lora);
-}
-
-static void save_checkpoint_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) {
-    gguf_set_val_str(fctx, LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA);
-    save_llama_lora_gguf(fctx, model, lora);
-    save_train_state_gguf(fctx, train);
-}
-
-static bool load_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) {
-    struct ggml_context * f_ggml_ctx;
-    struct gguf_init_params params;
-    params.no_alloc = false;
-    params.ctx = &f_ggml_ctx;
-    struct gguf_context * fctx = gguf_init_from_file(filename, params);
-    if (fctx == NULL) {
-        return false;
-    }
-
-    load_checkpoint_lora_gguf(fctx, f_ggml_ctx, model, lora, train);
-
-    gguf_free(fctx);
-    return true;
-}
-
-static void save_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) {
-    printf("%s: saving to %s\n", __func__, filename);
-    struct gguf_context * fctx = gguf_init_empty();
-
-    save_checkpoint_lora_gguf(fctx, model, lora, train);
-
-    // write file
-    const bool only_meta = false;
-    gguf_write_to_file(fctx, filename, only_meta);
-    gguf_free(fctx);
-}
-
-struct llama_file {
-    // use FILE * so we don't have to re-open the file to mmap
-    FILE * fp;
-    size_t size;
-
-    llama_file(const char * fname, const char * mode) {
-        fp = std::fopen(fname, mode);
-        if (fp == NULL) {
-            size = 0;
-        } else {
-            seek(0, SEEK_END);
-            size = tell();
-            seek(0, SEEK_SET);
-        }
-    }
-
-    size_t tell() const {
-#ifdef _WIN32
-        __int64 ret = _ftelli64(fp);
-#else
-        long ret = std::ftell(fp);
-#endif
-        GGML_ASSERT(ret != -1); // this really shouldn't fail
-        return (size_t) ret;
-    }
-
-    void seek(size_t offset, int whence) {
-#ifdef _WIN32
-        int ret = _fseeki64(fp, (__int64) offset, whence);
-#else
-        int ret = std::fseek(fp, (long) offset, whence);
-#endif
-        GGML_ASSERT(ret == 0); // same
-    }
-
-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
-            return;
-        }
-        errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
-        if (ferror(fp)) {
-            die_fmt("read error: %s", strerror(errno));
-        }
-        if (ret != 1) {
-            die("unexpectedly reached end of file");
-        }
-    }
-
-    std::uint32_t read_u32() {
-        std::uint32_t ret;
-        read_raw(&ret, sizeof(ret));
-        return ret;
-    }
-
-    std::string read_string(std::uint32_t len) {
-        std::vector<char> chars(len);
-        read_raw(chars.data(), len);
-        return std::string(chars.data(), len);
-    }
-
-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
-            return;
-        }
-        errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
-        if (ret != 1) {
-            die_fmt("write error: %s", strerror(errno));
-        }
-    }
-
-    void write_u32(std::uint32_t val) {
-        write_raw(&val, sizeof(val));
-    }
-
-    ~llama_file() {
-        if (fp) {
-            std::fclose(fp);
-        }
-    }
-};
-
-static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor, const char * name) {
-    if (tensor == NULL) {
-        file->write_u32(0);
-        file->write_u32(0);
-        file->write_u32(GGML_TYPE_F32);
-        file->seek((0-file->tell()) & 31, SEEK_CUR);
-        return;
-    }
-    if (name == NULL) {
-        name = ggml_get_name(tensor);
-    }
-    uint32_t name_len = strlen(name);
-    uint32_t nd = ggml_n_dims(tensor);
-    uint32_t ne[4] = { (uint32_t)tensor->ne[0],
-                       (uint32_t)tensor->ne[1],
-                       (uint32_t)tensor->ne[2],
-                       (uint32_t)tensor->ne[3] };
-    file->write_u32(nd);
-    file->write_u32(name_len);
-    file->write_u32(tensor->type);
-    file->write_raw(ne, sizeof(ne[0]) * nd);
-    file->write_raw(name, name_len);
-    file->seek((0-file->tell()) & 31, SEEK_CUR);
-    file->write_raw(tensor->data, ggml_nbytes(tensor));
-}
-
-static void save_as_llama_lora(const char * filename, struct my_llama_lora * lora) {
-    printf("%s: saving to %s\n", __func__, filename);
-    struct llama_file file(filename, "wb");
-    if (file.fp == NULL) {
-        return;
-    }
-
-    std::vector<char> tn_buf;
-    tn_buf.resize(GGML_MAX_NAME);
-
-    auto tn = [&tn_buf](const char * key, const char * suffix) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), "%s%s", key, suffix);
-        return tn_buf.data();
-    };
-
-    auto tni = [&tn_buf](const char * key, int bid, const char * suffix) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), key, bid);
-        std::string s = tn_buf.data();
-        snprintf(tn_buf.data(), tn_buf.size(), "%s%s", s.c_str(), suffix);
-        return tn_buf.data();
-    };
-
-    // write_magic
-    file.write_u32(LLAMA_FILE_MAGIC_GGLA);   // magic
-    file.write_u32(1); // version
-    // write_hparams
-    file.write_u32(lora->hparams.lora_r);
-    file.write_u32(lora->hparams.lora_alpha);
-    // write tensors
-    write_tensor(&file, lora->tok_embeddings_a, tn(LLM_TENSOR_TOKEN_EMBD,  ".weight.loraA"));
-    write_tensor(&file, lora->tok_embeddings_b, tn(LLM_TENSOR_TOKEN_EMBD,  ".weight.loraB"));
-    write_tensor(&file, lora->norm_a,           tn(LLM_TENSOR_OUTPUT_NORM, ".weight.loraA"));
-    write_tensor(&file, lora->norm_b,           tn(LLM_TENSOR_OUTPUT_NORM, ".weight.loraB"));
-    write_tensor(&file, lora->output_a,         tn(LLM_TENSOR_OUTPUT,      ".weight.loraA"));
-    write_tensor(&file, lora->output_b,         tn(LLM_TENSOR_OUTPUT,      ".weight.loraB"));
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
-        write_tensor(&file, layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, i, ".weight.loraA"));
-        write_tensor(&file, layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, i, ".weight.loraB"));
-        write_tensor(&file, layer.wq_a,             tni(LLM_TENSOR_ATTN_Q,    i, ".weight.loraA"));
-        write_tensor(&file, layer.wq_b,             tni(LLM_TENSOR_ATTN_Q,    i, ".weight.loraB"));
-        write_tensor(&file, layer.wk_a,             tni(LLM_TENSOR_ATTN_K,    i, ".weight.loraA"));
-        write_tensor(&file, layer.wk_b,             tni(LLM_TENSOR_ATTN_K,    i, ".weight.loraB"));
-        write_tensor(&file, layer.wv_a,             tni(LLM_TENSOR_ATTN_V,    i, ".weight.loraA"));
-        write_tensor(&file, layer.wv_b,             tni(LLM_TENSOR_ATTN_V,    i, ".weight.loraB"));
-        write_tensor(&file, layer.wo_a,             tni(LLM_TENSOR_ATTN_OUT,  i, ".weight.loraA"));
-        write_tensor(&file, layer.wo_b,             tni(LLM_TENSOR_ATTN_OUT,  i, ".weight.loraB"));
-        write_tensor(&file, layer.ffn_norm_a,       tni(LLM_TENSOR_FFN_NORM,  i, ".weight.loraA"));
-        write_tensor(&file, layer.ffn_norm_b,       tni(LLM_TENSOR_FFN_NORM,  i, ".weight.loraB"));
-        write_tensor(&file, layer.ffn_gate_a,       tni(LLM_TENSOR_FFN_GATE,  i, ".weight.loraA"));
-        write_tensor(&file, layer.ffn_gate_b,       tni(LLM_TENSOR_FFN_GATE,  i, ".weight.loraB"));
-        write_tensor(&file, layer.ffn_down_a,       tni(LLM_TENSOR_FFN_DOWN,  i, ".weight.loraA"));
-        write_tensor(&file, layer.ffn_down_b,       tni(LLM_TENSOR_FFN_DOWN,  i, ".weight.loraB"));
-        write_tensor(&file, layer.ffn_up_a,         tni(LLM_TENSOR_FFN_UP,    i, ".weight.loraA"));
-        write_tensor(&file, layer.ffn_up_b,         tni(LLM_TENSOR_FFN_UP,    i, ".weight.loraB"));
-    }
-}
-
-struct train_params {
-    struct train_params_common common;
-
-    const char * fn_model_base;
-    const char * fn_lora_out;
-
-    bool only_write_lora;
-
-    float f_norm_rms_eps;
-    float rope_freq_base;
-    float rope_freq_scale;
-
-    bool custom_f_norm_rms_eps;
-    bool custom_rope_freq_base;
-    bool custom_rope_freq_scale;
-
-    int32_t lora_r;
-    int32_t lora_alpha;
-    bool custom_lora_alpha;
-
-    uint32_t n_rank_attention_norm;
-    uint32_t n_rank_wq;
-    uint32_t n_rank_wk;
-    uint32_t n_rank_wv;
-    uint32_t n_rank_wo;
-    uint32_t n_rank_ffn_norm;
-    uint32_t n_rank_ffn_gate;
-    uint32_t n_rank_ffn_down;
-    uint32_t n_rank_ffn_up;
-    uint32_t n_rank_tok_embeddings;
-    uint32_t n_rank_norm;
-    uint32_t n_rank_output;
-
-    bool custom_n_rank_attention_norm;
-    bool custom_n_rank_wq;
-    bool custom_n_rank_wk;
-    bool custom_n_rank_wv;
-    bool custom_n_rank_wo;
-    bool custom_n_rank_ffn_norm;
-    bool custom_n_rank_ffn_gate;
-    bool custom_n_rank_ffn_down;
-    bool custom_n_rank_ffn_up;
-    bool custom_n_rank_tok_embeddings;
-    bool custom_n_rank_norm;
-    bool custom_n_rank_output;
-};
-
-static struct train_params get_default_train_params() {
-    struct train_params params;
-    params.common = get_default_train_params_common();
-    params.fn_model_base     = "";
-    params.fn_lora_out       = "ggml-lora-ITERATION-f32.gguf";
-
-    params.only_write_lora = false;
-
-    params.f_norm_rms_eps  = 1e-5f;
-    params.rope_freq_base  = 10000.0f;
-    params.rope_freq_scale = 1.0f;
-
-    params.custom_f_norm_rms_eps  = false;
-    params.custom_rope_freq_base  = false;
-    params.custom_rope_freq_scale = false;
-
-    params.lora_r      = 4;
-    params.lora_alpha  = 4;
-    params.custom_lora_alpha = false;
-
-    params.n_rank_attention_norm = 1;
-    params.n_rank_wq             = 4;
-    params.n_rank_wk             = 4;
-    params.n_rank_wv             = 4;
-    params.n_rank_wo             = 4;
-    params.n_rank_ffn_norm       = 1;
-    params.n_rank_ffn_gate       = 4;
-    params.n_rank_ffn_down       = 4;
-    params.n_rank_ffn_up         = 4;
-    params.n_rank_tok_embeddings = 4;
-    params.n_rank_norm           = 1;
-    params.n_rank_output         = 4;
-
-    params.custom_n_rank_attention_norm = false;
-    params.custom_n_rank_wq             = false;
-    params.custom_n_rank_wk             = false;
-    params.custom_n_rank_wv             = false;
-    params.custom_n_rank_wo             = false;
-    params.custom_n_rank_ffn_norm       = false;
-    params.custom_n_rank_ffn_gate       = false;
-    params.custom_n_rank_ffn_down       = false;
-    params.custom_n_rank_ffn_up         = false;
-    params.custom_n_rank_tok_embeddings = false;
-    params.custom_n_rank_norm           = false;
-    params.custom_n_rank_output         = false;
-
-    return params;
-}
-
-static void train_print_usage(int argc, char ** argv, const struct train_params * params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help                 show this help message and exit\n");
-
-    fprintf(stderr, "  --model-base FNAME         model path from which to load base model (default '%s')\n", params->fn_model_base);
-    fprintf(stderr, "  --lora-out FNAME           path to save llama lora (default '%s')\n", params->fn_lora_out);
-    fprintf(stderr, "  --only-write-lora          only save llama lora, don't do any training.  use this if you only want to convert a checkpoint to a lora adapter.\n");
-    fprintf(stderr, "  --norm-rms-eps F           RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps);
-    fprintf(stderr, "  --rope-freq-base F         Frequency base for ROPE (default %f)\n", params->rope_freq_base);
-    fprintf(stderr, "  --rope-freq-scale F        Frequency scale for ROPE (default %f)\n", params->rope_freq_scale);
-    fprintf(stderr, "  --lora-alpha N             LORA alpha : resulting LORA scaling is alpha/r. (default %d)\n", params->lora_alpha);
-    fprintf(stderr, "  --lora-r N                 LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default %d)\n", params->lora_r);
-    fprintf(stderr, "  --rank-att-norm N          LORA rank for attention norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n");
-    fprintf(stderr, "  --rank-ffn-norm N          LORA rank for feed-forward norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n");
-    fprintf(stderr, "  --rank-out-norm N          LORA rank for output norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n");
-    fprintf(stderr, "  --rank-tok-embd N          LORA rank for token embeddings tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-out N               LORA rank for output tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-wq N                LORA rank for wq tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-wk N                LORA rank for wk tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-wv N                LORA rank for wv tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-wo N                LORA rank for wo tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-ffn_gate N          LORA rank for ffn_gate tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-ffn_down N          LORA rank for ffn_down tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-ffn_up N            LORA rank for ffn_up tensor, overrides default rank.\n");
-
-    print_common_train_usage(argc, argv, &params->common);
-}
-
-static bool train_params_parse(int argc, char ** argv, struct train_params * params) {
-    bool invalid_param = false;
-    std::string arg;
-    struct train_params default_params = get_default_train_params();
-    const std::string arg_prefix = "--";
-
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
-            std::replace(arg.begin(), arg.end(), '_', '-');
-        }
-
-        if (consume_common_train_arg(argc, argv, &i, &params->common, &invalid_param)) {
-            if (invalid_param) {
-                break;
-            } else if (params->common.print_usage) {
-                train_print_usage(argc, argv, &default_params);
-                exit(0);
-            }
-        } else if (arg == "--model-base") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->fn_model_base = argv[i];
-        } else if (arg == "--lora-out") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->fn_lora_out = argv[i];
-        } else if (arg == "--only-write-lora") {
-            params->only_write_lora = true;
-        } else if (arg == "--norm-rms-eps") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->f_norm_rms_eps = std::stof(argv[i]);
-            params->custom_f_norm_rms_eps = true;
-        } else if (arg == "--rope-freq-base") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->rope_freq_base = std::stof(argv[i]);
-            params->custom_rope_freq_base = true;
-        } else if (arg == "--rope-freq-scale") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->rope_freq_scale = std::stof(argv[i]);
-            params->custom_rope_freq_scale = true;
-        } else if (arg == "--lora-alpha") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->lora_alpha = std::stoi(argv[i]);
-            params->custom_lora_alpha = true;
-        } else if (arg == "--lora-r") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->lora_r = std::stoi(argv[i]);
-        } else if (arg == "--rank-att-norm") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_attention_norm = std::stoi(argv[i]);
-            params->custom_n_rank_attention_norm = true;
-        } else if (arg == "--rank-ffn-norm") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_ffn_norm = std::stoi(argv[i]);
-            params->custom_n_rank_ffn_norm = true;
-        } else if (arg == "--rank-out-norm") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_norm = std::stoi(argv[i]);
-            params->custom_n_rank_norm = true;
-        } else if (arg == "--rank-tok-embd") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_tok_embeddings = std::stoi(argv[i]);
-            params->custom_n_rank_tok_embeddings = true;
-        } else if (arg == "--rank-out") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_output = std::stoi(argv[i]);
-            params->custom_n_rank_output = true;
-        } else if (arg == "--rank-wq") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_wq = std::stoi(argv[i]);
-            params->custom_n_rank_wq = true;
-        } else if (arg == "--rank-wk") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_wk = std::stoi(argv[i]);
-            params->custom_n_rank_wk = true;
-        } else if (arg == "--rank-wv") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_wv = std::stoi(argv[i]);
-            params->custom_n_rank_wv = true;
-        } else if (arg == "--rank-wo") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_wo = std::stoi(argv[i]);
-            params->custom_n_rank_wo = true;
-        } else if (arg == "--rank-ffn_gate") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_ffn_gate = std::stoi(argv[i]);
-            params->custom_n_rank_ffn_gate = true;
-        } else if (arg == "--rank-ffn_down") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_ffn_down = std::stoi(argv[i]);
-            params->custom_n_rank_ffn_down = true;
-        } else if (arg == "--rank-ffn_up") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_rank_ffn_up = std::stoi(argv[i]);
-            params->custom_n_rank_ffn_up = true;
-        } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            train_print_usage(argc, argv, &default_params);
-            exit(1);
-        }
-    }
-    if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        train_print_usage(argc, argv, &default_params);
-        exit(1);
-    }
-    finish_processing_train_args(&params->common);
-    return true;
-}
-
-struct save_train_files_data {
-    const char            * fn_checkpoint_out;
-    const char            * fn_lora_out;
-    const char            * pattern_fn_it;
-    const char            * fn_latest;
-    struct my_llama_model * model;
-    struct my_llama_lora  * lora;
-};
-
-static void save_train_files(void * vdata, struct train_state * train) {
-    struct save_train_files_data * data   = (struct save_train_files_data *) vdata;
-
-    int64_t iter = train->opt->iter;
-
-    if (strlen(data->fn_checkpoint_out) > 0) {
-        save_checkpoint_lora_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->model, data->lora, train);
-        save_checkpoint_lora_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, -1  ).c_str(), data->model, data->lora, train);
-    }
-    if (strlen(data->fn_lora_out) > 0) {
-        save_as_llama_lora(get_train_filename(data->fn_lora_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->lora);
-        save_as_llama_lora(get_train_filename(data->fn_lora_out, data->pattern_fn_it, data->fn_latest, -1  ).c_str(), data->lora);
-    }
-}
-
-static int64_t get_parameter_count(struct my_llama_lora* lora) {
-    int64_t nx = 0;
-    nx += ggml_nelements(lora->tok_embeddings_a);
-    nx += ggml_nelements(lora->tok_embeddings_b);
-    nx += ggml_nelements(lora->norm_a);
-    nx += ggml_nelements(lora->norm_b);
-    nx += ggml_nelements(lora->output_a);
-    nx += ggml_nelements(lora->output_b);
-
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
-        nx += ggml_nelements(layer.attention_norm_a);
-        nx += ggml_nelements(layer.attention_norm_b);
-        nx += ggml_nelements(layer.wq_a);
-        nx += ggml_nelements(layer.wq_b);
-        nx += ggml_nelements(layer.wk_a);
-        nx += ggml_nelements(layer.wk_b);
-        nx += ggml_nelements(layer.wv_a);
-        nx += ggml_nelements(layer.wv_b);
-        nx += ggml_nelements(layer.wo_a);
-        nx += ggml_nelements(layer.wo_b);
-        nx += ggml_nelements(layer.ffn_norm_a);
-        nx += ggml_nelements(layer.ffn_norm_b);
-        nx += ggml_nelements(layer.ffn_gate_a);
-        nx += ggml_nelements(layer.ffn_gate_b);
-        nx += ggml_nelements(layer.ffn_down_a);
-        nx += ggml_nelements(layer.ffn_down_b);
-        nx += ggml_nelements(layer.ffn_up_a);
-        nx += ggml_nelements(layer.ffn_up_b);
-    }
-    return nx;
-}
-
-int main(int argc, char ** argv) {
-    struct train_params params = get_default_train_params();
-
-    if (!train_params_parse(argc, argv, &params)) {
-        return 1;
-    }
-
-    if (params.common.seed == LLAMA_DEFAULT_SEED) {
-        params.common.seed = time(NULL);
-    }
-    printf("%s: seed: %u\n", __func__, params.common.seed);
-    srand(params.common.seed);
-
-    struct llama_model_params llama_mparams = llama_model_default_params();
-    llama_mparams.n_gpu_layers = params.common.n_gpu_layers;
-    llama_mparams.vocab_only = false;
-
-    printf("%s: model base = '%s'\n", __func__, params.fn_model_base);
-    struct llama_model * lmodel = llama_load_model_from_file(params.fn_model_base, llama_mparams);
-
-    struct llama_context_params llama_cparams = llama_context_default_params();
-    struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_cparams);
-
-    struct my_llama_model model;
-    init_model(lmodel, &model, params.fn_model_base, params.common.n_ctx);
-
-    struct my_llama_lora lora;
-
-    struct train_state      * train = init_train_state();
-    struct ggml_opt_context * opt   = train->opt;
-
-    // set params from command line
-    if (params.custom_f_norm_rms_eps) {
-        model.hparams.f_norm_rms_eps  = params.f_norm_rms_eps;
-    }
-    if (params.custom_rope_freq_base) {
-        model.hparams.rope_freq_base  = params.rope_freq_base;
-    }
-    if (params.custom_rope_freq_scale) {
-        model.hparams.rope_freq_scale = params.rope_freq_scale;
-    }
-    lora.hparams.lora_r                = params.lora_r;
-    lora.hparams.lora_alpha            = params.custom_lora_alpha            ? params.lora_alpha            : params.lora_r;
-    uint32_t n_rank_attention_norm     = params.custom_n_rank_attention_norm ? params.n_rank_attention_norm : 1;
-    uint32_t n_rank_wq                 = params.custom_n_rank_wq             ? params.n_rank_wq             : params.lora_r;
-    uint32_t n_rank_wk                 = params.custom_n_rank_wk             ? params.n_rank_wk             : params.lora_r;
-    uint32_t n_rank_wv                 = params.custom_n_rank_wv             ? params.n_rank_wv             : params.lora_r;
-    uint32_t n_rank_wo                 = params.custom_n_rank_wo             ? params.n_rank_wo             : params.lora_r;
-    uint32_t n_rank_ffn_norm           = params.custom_n_rank_ffn_norm       ? params.n_rank_ffn_norm       : 1;
-    uint32_t n_rank_ffn_gate           = params.custom_n_rank_ffn_gate       ? params.n_rank_ffn_gate       : params.lora_r;
-    uint32_t n_rank_ffn_down           = params.custom_n_rank_ffn_down       ? params.n_rank_ffn_down       : params.lora_r;
-    uint32_t n_rank_ffn_up             = params.custom_n_rank_ffn_up         ? params.n_rank_ffn_up         : params.lora_r;
-    uint32_t n_rank_tok_embeddings     = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r;
-    uint32_t n_rank_norm               = params.custom_n_rank_norm           ? params.n_rank_norm           : 1;
-    uint32_t n_rank_output             = params.custom_n_rank_output         ? params.n_rank_output         : params.lora_r;
-    lora.hparams.n_rank_attention_norm = n_rank_attention_norm;
-    lora.hparams.n_rank_wq             = n_rank_wq;
-    lora.hparams.n_rank_wk             = n_rank_wk;
-    lora.hparams.n_rank_wv             = n_rank_wv;
-    lora.hparams.n_rank_wo             = n_rank_wo;
-    lora.hparams.n_rank_ffn_norm       = n_rank_ffn_norm;
-    lora.hparams.n_rank_ffn_gate       = n_rank_ffn_gate;
-    lora.hparams.n_rank_ffn_down       = n_rank_ffn_down;
-    lora.hparams.n_rank_ffn_up         = n_rank_ffn_up;
-    lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings;
-    lora.hparams.n_rank_norm           = n_rank_norm;
-    lora.hparams.n_rank_output         = n_rank_output;
-
-    // set opt params from command line
-    opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
-    opt->params.print_forward_graph     = false;
-    opt->params.print_backward_graph    = false;
-    opt->params.graph_size              = LLAMA_TRAIN_MAX_NODES;
-    opt->params.n_threads               = params.common.n_threads;
-    opt->params.past                    = params.common.opt_past;
-    opt->params.delta                   = params.common.opt_delta;
-    opt->params.max_no_improvement      = params.common.opt_max_no_improvement;
-    opt->params.n_gradient_accumulation = params.common.n_gradient_accumulation;
-    opt->params.adam.n_iter             = params.common.adam_n_iter;
-    opt->params.adam.sched              = 1.0f;
-    opt->params.adam.alpha              = params.common.adam_alpha;
-    opt->params.adam.decay              = params.common.adam_decay;
-    opt->params.adam.decay_min_ndim     = params.common.adam_decay_min_ndim;
-    opt->params.adam.beta1              = params.common.adam_beta1;
-    opt->params.adam.beta2              = params.common.adam_beta2;
-    opt->params.adam.gclip              = params.common.adam_gclip;
-    opt->params.adam.eps_f              = params.common.adam_eps_f;
-
-    printf("%s: init model\n", __func__);
-    bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train);
-
-    if (existed) {
-        // overwrite last n_ctx with user provided n_ctx
-        if (params.common.custom_n_ctx) {
-            model.hparams.n_ctx = params.common.n_ctx;
-        }
-
-        const bool opt_param_count_changed = (
-           (lora.hparams.n_rank_attention_norm != n_rank_attention_norm)
-        || (lora.hparams.n_rank_wq             != n_rank_wq)
-        || (lora.hparams.n_rank_wk             != n_rank_wk)
-        || (lora.hparams.n_rank_wv             != n_rank_wv)
-        || (lora.hparams.n_rank_wo             != n_rank_wo)
-        || (lora.hparams.n_rank_ffn_norm       != n_rank_ffn_norm)
-        || (lora.hparams.n_rank_ffn_gate       != n_rank_ffn_gate)
-        || (lora.hparams.n_rank_ffn_down       != n_rank_ffn_down)
-        || (lora.hparams.n_rank_ffn_up         != n_rank_ffn_up)
-        || (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings)
-        || (lora.hparams.n_rank_norm           != n_rank_norm)
-        || (lora.hparams.n_rank_output         != n_rank_output)
-        );
-
-        const bool opt_past_changed = opt->params.past != params.common.opt_past;
-
-        if (opt_param_count_changed) {
-            print_lora_params(&lora.hparams);
-            die("Provided rank differs from checkpoint file. To use different rank start finetune from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting.");
-            // need to discard previous optimizer gradient statistics and opt_init with new shapes
-            // TODO
-        }
-        if (opt_past_changed) {
-            die("Optimizer parameter '--opt-past N' differs from checkpoint file. To use different value finetune from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting");
-            // need to discard previous optimizer past function value statistics and opt_init with new shapes
-            // TODO
-        }
-    } else { // existed == false
-        init_lora(&model, &lora);
-        randomize_lora(&lora, params.common.seed, 0.0f, 1.0f, -1.0f, +1.0f);
-        if (!params.only_write_lora) {
-            ggml_opt_init(opt->ctx, opt, opt->params, get_parameter_count(&lora));
-        }
-    }
-    opt->iter = train->train_its;
-
-    print_params(&model.hparams);
-    print_lora_params(&lora.hparams);
-    printf("%s: total train_iterations %llu\n", __func__, (long long unsigned) train->train_its);
-    printf("%s: seen train_samples     %llu\n", __func__, (long long unsigned) train->train_samples);
-    printf("%s: seen train_tokens      %llu\n", __func__, (long long unsigned) train->train_tokens);
-    printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
-    printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)), (float) (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)) / (1024.0f*1024.0f));
-
-    if (params.only_write_lora) {
-        save_train_files_data save_data;
-        save_data.fn_checkpoint_out = "";
-        save_data.fn_lora_out       = params.fn_lora_out;
-        save_data.pattern_fn_it     = params.common.pattern_fn_it;
-        save_data.fn_latest         = params.common.fn_latest;
-        save_data.model             = &model;
-        save_data.lora              = &lora;
-
-        save_train_files(&save_data, train);
-
-        free_train_state(train);
-        ggml_free(lora.ctx);
-        llama_free(lctx);
-        llama_free_model(lmodel);
-        return 0;
-    }
-
-    printf("%s: opt_size  = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f));
-    printf("%s: opt iter %d\n", __func__, opt->iter);
-
-    int n_tokens = model.hparams.n_ctx;
-    int n_vocab  = model.hparams.n_vocab;
-    int n_batch  = params.common.n_batch;
-
-    // context for input tensors without their data
-    struct ggml_init_params ctx_input_params = {
-        ggml_tensor_overhead() * 2, // mem_size
-        NULL,                       // mem_buffer
-        true,                       // no_alloc
-    };
-    struct ggml_context * ctx_input = ggml_init(ctx_input_params);
-
-    // the input tensors
-    struct ggml_tensor * tokens_input  = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch);
-    struct ggml_tensor * target_probs  = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab,  n_tokens, n_batch);
-
-    // allocate input tensors
-    // measure required memory for input tensors
-    ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type());
-    size_t max_input_size = ggml_backend_buffer_get_size(input_data);
-    printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
-
-    // context for compute tensors without their data
-    const size_t estimated_compute_size_wo_data = (
-            2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() +
-            (params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true))
-    );
-    struct ggml_init_params ctx_compute_params = {
-        estimated_compute_size_wo_data, // mem_size
-        NULL,                           // mem_buffer
-        true,                           // no_alloc
-    };
-    struct ggml_context * ctx_compute = NULL;
-
-    struct ggml_tensor * loss   = NULL;
-    struct ggml_tensor * logits = NULL;
-
-    struct ggml_cgraph * gf     = NULL;
-    struct ggml_cgraph * gb     = NULL;
-    struct ggml_cgraph * gb_tmp = NULL;
-
-    // measure required memory for compute tensors
-    size_t best_compute_size = SIZE_MAX;
-    enum ggml_cgraph_eval_order best_order = GGML_CGRAPH_EVAL_ORDER_COUNT;
-    // find best evaluation order
-    for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
-        ctx_compute = ggml_init(ctx_compute_params);
-        ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
-        gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-        gf->order = (enum ggml_cgraph_eval_order) order;
-        gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-        gb_tmp = params.common.use_checkpointing
-            ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
-            : NULL;
-        loss = llama_build_lora_finetune_graphs(
-            &model, &lora, alloc, ctx_compute,
-            gf, gb, gb_tmp,
-            &logits, tokens_input, target_probs,
-            n_tokens, n_batch,
-            params.common.use_flash,
-            params.common.use_checkpointing,
-            true
-        );
-        size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer
-        if (max_compute_size < best_compute_size) {
-            best_compute_size = max_compute_size;
-            best_order = gf->order;
-        }
-        ggml_gallocr_free(alloc);
-        ggml_free(ctx_compute);
-    }
-    size_t max_compute_size = best_compute_size;
-    printf("%s: compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f));
-    printf("%s: evaluation order = %s\n", __func__,
-        (best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" :
-        (best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" :
-        "invalid");
-
-    // allocate compute tensors
-    ctx_compute = ggml_init(ctx_compute_params);
-    ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
-    gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-    gf->order = best_order;
-    gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-    gb_tmp = params.common.use_checkpointing
-        ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
-        : NULL;
-    loss = llama_build_lora_finetune_graphs(
-        &model, &lora, alloc, ctx_compute,
-        gf, gb, gb_tmp,
-        &logits, tokens_input, target_probs,
-        n_tokens, n_batch,
-        params.common.use_flash,
-        params.common.use_checkpointing,
-        false
-    );
-
-    // tokenize data
-    std::vector<llama_token> train_tokens;
-    std::vector<size_t> train_samples_begin;
-    std::vector<size_t> train_samples_size;
-    printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data);
-    printf("%s: sample-start: %s\n", __func__, params.common.sample_start.c_str());
-    printf("%s: include-sample-start: %s\n", __func__, params.common.include_sample_start ? "true" : "false");
-    tokenize_file(lctx,
-            params.common.fn_train_data,
-            params.common.sample_start,
-            params.common.include_sample_start,
-            params.common.overlapping_samples,
-            n_tokens,
-            train_tokens,
-            train_samples_begin,
-            train_samples_size);
-    GGML_ASSERT(train_samples_begin.size() == train_samples_size.size());
-
-    printf("%s: number of training tokens: %zu\n", __func__, train_tokens.size());
-
-    std::vector<size_t> token_noccurs;
-    token_noccurs.resize(model.hparams.n_vocab, 0);
-    for (unsigned int i = 0; i < train_tokens.size(); ++i) {
-        ++token_noccurs[train_tokens[i]];
-    }
-    int n_unique_tokens = 0;
-    for (unsigned int i = 0; i < token_noccurs.size(); ++i) {
-        if (token_noccurs[i] == 0) continue;
-        ++n_unique_tokens;
-    }
-    printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens);
-
-    size_t shuffle_samples_hash = compute_samples_hash(params.common.fn_train_data, train_samples_begin.data(), train_samples_size.data(), train_samples_size.size());
-    const bool changed_train_data = (shuffle_samples_hash != train->shuffle_samples_hash) || (train->shuffle_sample_count != train_samples_size.size());
-    if (changed_train_data) {
-        printf("%s: train data seems to have changed. restarting shuffled epoch.\n", __func__);
-    }
-    if (params.common.force_reshuffle) {
-        printf("%s: forced reshuffling of data. restarting with newly shuffled epoch.\n", __func__);
-    }
-    if ((train->shuffle_rng_state_current == "") || changed_train_data || params.common.force_reshuffle) {
-        train->shuffle_rng_state_current = mt19937_seed_to_state(params.common.seed);
-        train->shuffle_sample_count = train_samples_size.size();
-        train->shuffle_next_sample = 0;
-        train->shuffle_samples_hash = shuffle_samples_hash;
-    }
-    std::vector<size_t> train_shuffled_samples_offs;
-    std::vector<size_t> train_shuffled_samples_begin;
-    std::vector<size_t> train_shuffled_samples_size;
-    train_shuffled_samples_offs.resize(train_samples_begin.size());
-    train_shuffled_samples_begin.resize(train_samples_begin.size());
-    train_shuffled_samples_size.resize(train_samples_size.size());
-    train->shuffle_rng_state_next = shuffle_samples(
-        train->shuffle_rng_state_current,
-        train_shuffled_samples_offs.data(),
-        train_shuffled_samples_begin.data(),
-        train_shuffled_samples_size.data(),
-        train_samples_begin.data(),
-        train_samples_size.data(),
-        train_samples_size.size());
-
-    printf("%s: begin training\n", __func__);
-
-    save_train_files_data save_data;
-    save_data.fn_checkpoint_out = params.common.fn_checkpoint_out;
-    save_data.fn_lora_out       = params.fn_lora_out;
-    save_data.pattern_fn_it     = params.common.pattern_fn_it;
-    save_data.fn_latest         = params.common.fn_latest;
-    save_data.model             = &model;
-    save_data.lora              = &lora;
-
-    struct train_opt_callback_data opt_cb_data;
-    opt_cb_data.params                 = &params.common;
-    opt_cb_data.train                  = train;
-    opt_cb_data.save_cb                = &save_train_files;
-    opt_cb_data.save_data              = &save_data;
-    opt_cb_data.lctx                   = lctx;
-    opt_cb_data.last_save_iter         = opt->iter;
-    opt_cb_data.tokens_data            = train_tokens.data();
-    opt_cb_data.tokens_size            = train_tokens.size();
-    opt_cb_data.samples_begin          = train_samples_begin.data();
-    opt_cb_data.samples_size           = train_samples_size.data();
-    opt_cb_data.shuffled_samples_offs  = train_shuffled_samples_offs.data();
-    opt_cb_data.shuffled_samples_begin = train_shuffled_samples_begin.data();
-    opt_cb_data.shuffled_samples_size  = train_shuffled_samples_size.data();
-    opt_cb_data.samples_count          = train_samples_size.size();
-    opt_cb_data.tokens_input           = tokens_input;
-    opt_cb_data.target_probs           = target_probs;
-    opt_cb_data.first_iter             = opt->iter;
-    opt_cb_data.first_epoch            = train->train_epochs;
-    opt_cb_data.iter_at_last_epoch     = -1;
-    opt_cb_data.last_time              = ggml_time_ms();
-    opt_cb_data.millis_per_iter        = 0.0;
-
-    // measure required memory for work buffer
-    size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE;
-    printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f));
-
-    // context for work buffer
-    struct ggml_init_params ctx_work_params = {
-        max_work_size, // mem_size
-        NULL,          // mem_buffer
-        false,         // no_alloc
-    };
-    struct ggml_context * ctx_work = ggml_init(ctx_work_params);
-
-    int64_t t0 = ggml_time_ms();
-
-    ggml_opt_resume_g(ctx_work, opt, loss, gf, gb, &train_opt_callback, (void *) &opt_cb_data);
-
-    ggml_free(ctx_work);
-    ggml_free(ctx_compute);
-    ggml_free(ctx_input);
-    ggml_gallocr_free(alloc);
-
-
-    int64_t t1 = ggml_time_ms();
-    printf("%s: total training time: ", __func__);
-    print_duration((double) (t1 - t0));
-    printf("\n");
-
-    int new_iters = opt->iter - opt_cb_data.last_save_iter;
-    if (new_iters > 0) {
-        train->train_its     += new_iters;
-        train->train_tokens  += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens;
-
-        save_train_files(&save_data, train);
-        opt_cb_data.last_save_iter = opt->iter;
-    }
-
-    ggml_free(opt->ctx);
-    free_train_state(train);
-    ggml_free(lora.ctx);
-    llama_free(lctx);
-    llama_free_model(lmodel);
-    return 0;
-}
diff --git a/examples/finetune/finetune.sh b/examples/finetune/finetune.sh
deleted file mode 100644
index e3cc7f271..000000000
--- a/examples/finetune/finetune.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-cd `dirname $0`
-cd ../..
-
-EXE="./llama-finetune"
-
-if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
-if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
-
-# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
-MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "llama-cli --lora" with GPU inferencing.
-
-while getopts "dg" opt; do
-  case $opt in
-    d)
-      DEBUGGER="gdb --args"
-      ;;
-    g)
-      EXE="./build/bin/Release/finetune"
-      GPUARG="--gpu-layers 25"
-      ;;
-  esac
-done
-
-$DEBUGGER $EXE \
-        --model-base $MODEL \
-        $GPUARG \
-        --checkpoint-in  chk-ol3b-shakespeare-LATEST.gguf \
-        --checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \
-        --lora-out lora-ol3b-shakespeare-ITERATION.bin \
-        --train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \
-        --save-every 10 \
-        --threads 10 --adam-iter 30 --batch 4 --ctx 64 \
-        --use-checkpointing
diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp
index dd53ba9b1..48a705e15 100644
--- a/examples/gbnf-validator/gbnf-validator.cpp
+++ b/examples/gbnf-validator/gbnf-validator.cpp
@@ -16,20 +16,25 @@ static bool llama_sample_grammar_string(struct llama_grammar * grammar, const st
     auto decoded = decode_utf8(input_str, {});
     const auto & code_points = decoded.first;
 
+    const llama_grammar_rules  & rules      = llama_grammar_get_rules (grammar);
+          llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);
+
     size_t pos = 0;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-        auto prev_stacks = grammar->stacks;
-        llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
-        if (grammar->stacks.empty()) {
+        const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy
+
+        llama_grammar_accept(rules, prev_stacks, *it, cur_stacks);
+
+        if (cur_stacks.empty()) {
             error_pos = pos;
             error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
-            grammar->stacks = prev_stacks;
+            cur_stacks = prev_stacks;
             return false;
         }
         ++pos;
     }
 
-    for (const auto & stack : grammar->stacks) {
+    for (const auto & stack : cur_stacks) {
         if (stack.empty()) {
             return true;
         }
diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp
index 575143771..7498f85ef 100644
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -92,6 +92,11 @@ static bool gguf_ex_read_0(const std::string & fname) {
 
     struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
 
+    if (!ctx) {
+        fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str());
+        return false;
+    }
+
     printf("%s: version:      %d\n", __func__, gguf_get_version(ctx));
     printf("%s: alignment:   %zu\n", __func__, gguf_get_alignment(ctx));
     printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md
index 29602881a..bb5faec94 100644
--- a/examples/imatrix/README.md
+++ b/examples/imatrix/README.md
@@ -1,6 +1,6 @@
 # llama.cpp/examples/imatrix
 
-Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantum models.
+Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantized models.
 More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861
 
 ## Usage
diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
index 92a6b16b1..2aafe2316 100644
--- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
@@ -409,7 +409,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
 
     const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
     if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
-        return env->NewStringUTF("");
+        return nullptr;
     }
 
     auto new_token_chars = llama_token_to_piece(context, new_token_id);
diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
index 2a3f9f758..58c32ca53 100644
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -26,11 +26,12 @@ actor LlamaContext {
     private var context: OpaquePointer
     private var batch: llama_batch
     private var tokens_list: [llama_token]
+    var is_done: Bool = false
 
     /// This variable is used to store temporarily invalid cchars
     private var temporary_invalid_cchars: [CChar]
 
-    var n_len: Int32 = 64
+    var n_len: Int32 = 1024
     var n_cur: Int32 = 0
 
     var n_decode: Int32 = 0
@@ -160,6 +161,7 @@ actor LlamaContext {
 
         if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
             print("\n")
+            is_done = true
             let new_token_str = String(cString: temporary_invalid_cchars + [0])
             temporary_invalid_cchars.removeAll()
             return new_token_str
diff --git a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
index 2c1e3f61b..b8f6a31d5 100644
--- a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
+++ b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
@@ -132,7 +132,7 @@ class LlamaState: ObservableObject {
         messageLog += "\(text)"
 
         Task.detached {
-            while await llamaContext.n_cur < llamaContext.n_len {
+            while await !llamaContext.is_done {
                 let result = await llamaContext.completion_loop()
                 await MainActor.run {
                     self.messageLog += "\(result)"
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index a0d817b1a..61e960ea2 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -124,6 +124,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
     auto formatted = llama_chat_format_single(
         model, g_params->chat_template, chat_msgs, new_msg, role == "user");
     chat_msgs.push_back({role, content});
+    LOG("formatted: %s\n", formatted.c_str());
     return formatted;
 }
 
diff --git a/examples/pydantic_models_to_grammar_examples.py b/examples/pydantic_models_to_grammar_examples.py
old mode 100644
new mode 100755
index 504ed98df..eb000d5cc
--- a/examples/pydantic_models_to_grammar_examples.py
+++ b/examples/pydantic_models_to_grammar_examples.py
@@ -1,8 +1,15 @@
-# Function calling example using pydantic models.
+#!/usr/bin/env python3
+
+"""Function calling example using pydantic models."""
+
 from __future__ import annotations
 
+import argparse
 import datetime
 import json
+import logging
+import textwrap
+import sys
 from enum import Enum
 from typing import Optional, Union
 
@@ -12,30 +19,54 @@ from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert
                                         create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation)
 
 
-# Function to get completion on the llama.cpp server with grammar.
-def create_completion(prompt, grammar):
+def create_completion(host, prompt, gbnf_grammar):
+    """Calls the /completion API on llama-server.
+
+    See
+    https://github.com/ggerganov/llama.cpp/tree/HEAD/examples/server#api-endpoints
+    """
+    print(f"  Request:\n    Grammar:\n{textwrap.indent(gbnf_grammar, '      ')}\n    Prompt:\n{textwrap.indent(prompt.rstrip(), '      ')}")
     headers = {"Content-Type": "application/json"}
-    data = {"prompt": prompt, "grammar": grammar}
-
-    response = requests.post("http://127.0.0.1:8080/completion", headers=headers, json=data)
-    data = response.json()
-
+    data = {"prompt": prompt, "grammar": gbnf_grammar}
+    result = requests.post(f"http://{host}/completion", headers=headers, json=data).json()
     assert data.get("error") is None, data
-
-    print(data["content"])
-    return data["content"]
+    logging.info("Result: %s", result)
+    content = result["content"]
+    print(f"  Model: {result['model']}")
+    print(f"  Result:\n{textwrap.indent(json.dumps(json.loads(content), indent=2), '    ')}")
+    return content
 
 
 # A function for the agent to send a message to the user.
 class SendMessageToUser(BaseModel):
-    """
-    Send a message to the User.
-    """
+    """Send a message to the User."""
     chain_of_thought: str = Field(..., description="Your chain of thought while sending the message.")
     message: str = Field(..., description="Message you want to send to the user.")
 
     def run(self):
-        print(self.message)
+        print(f"SendMessageToUser: {self.message}")
+
+
+def example_rce(host):
+    """Minimal test case where the LLM call an arbitrary python function."""
+    print("- example_rce")
+    tools = [SendMessageToUser]
+    gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
+        pydantic_model_list=tools, outer_object_name="function",
+        outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
+    system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
+    user_message = "What is 42 * 42?"
+    prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
+    text = create_completion(host, prompt, gbnf_grammar)
+    json_data = json.loads(text)
+    tools_map = {tool.__name__:tool for tool in tools}
+    # This finds "SendMessageToUser":
+    tool = tools_map.get(json_data["function"])
+    if not tool:
+        print(f"Error: unknown tool {json_data['function']}")
+        return 1
+    tool(**json_data["function_parameters"]).run()
+    return 0
 
 
 # Enum for the calculator tool.
@@ -46,11 +77,11 @@ class MathOperation(Enum):
     DIVIDE = "divide"
 
 
-# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
+# Simple pydantic calculator tool for the agent that can add, subtract,
+# multiply, and divide. Docstring and description of fields will be used in
+# system prompt.
 class Calculator(BaseModel):
-    """
-    Perform a math operation on two numbers.
-    """
+    """Perform a math operation on two numbers."""
     number_one: Union[int, float] = Field(..., description="First number.")
     operation: MathOperation = Field(..., description="Math operation to perform.")
     number_two: Union[int, float] = Field(..., description="Second number.")
@@ -68,55 +99,61 @@ class Calculator(BaseModel):
             raise ValueError("Unknown operation.")
 
 
-# Here the grammar gets generated by passing the available function models to generate_gbnf_grammar_and_documentation function. This also generates a documentation usable by the LLM.
-# pydantic_model_list is the list of pydanitc models
-# outer_object_name is an optional name for an outer object around the actual model object. Like a "function" object with "function_parameters" which contains the actual model object. If None, no outer object will be generated
-# outer_object_content is the name of outer object content.
-# model_prefix is the optional prefix for models in the documentation. (Default="Output Model")
-# fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields")
-gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
-    pydantic_model_list=[SendMessageToUser, Calculator], outer_object_name="function",
-    outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
+def example_calculator(host):
+    """Have the LLM ask to get a calculation done.
 
-print(gbnf_grammar)
-print(documentation)
+    Here the grammar gets generated by passing the available function models to
+    generate_gbnf_grammar_and_documentation function. This also generates a
+    documentation usable by the LLM.
 
-system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
+    pydantic_model_list is the list of pydantic models outer_object_name is an
+    optional name for an outer object around the actual model object. Like a
+    "function" object with "function_parameters" which contains the actual model
+    object. If None, no outer object will be generated outer_object_content is
+    the name of outer object content.
 
-user_message = "What is 42 * 42?"
-prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
-
-text = create_completion(prompt=prompt, grammar=gbnf_grammar)
-# This should output something like this:
-# {
-#     "function": "calculator",
-#     "function_parameters": {
-#         "number_one": 42,
-#         "operation": "multiply",
-#         "number_two": 42
-#     }
-# }
-function_dictionary = json.loads(text)
-if function_dictionary["function"] == "calculator":
-    function_parameters = {**function_dictionary["function_parameters"]}
-
-    print(Calculator(**function_parameters).run())
-    # This should output: 1764
+    model_prefix is the optional prefix for models in the documentation. (Default="Output Model")
+    fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields")
+    """
+    print("- example_calculator")
+    tools = [SendMessageToUser, Calculator]
+    gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
+        pydantic_model_list=tools, outer_object_name="function",
+        outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
+    system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
+    user_message1 = "What is 42 * 42?"
+    prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message1}<|im_end|>\n<|im_start|>assistant"
+    text = create_completion(host, prompt, gbnf_grammar)
+    json_data = json.loads(text)
+    expected = {
+        "function": "Calculator",
+        "function_parameters": {
+            "number_one": 42,
+            "operation": "multiply",
+            "number_two": 42
+        }
+    }
+    if json_data != expected:
+        print("  Result is not as expected!")
+    tools_map = {tool.__name__:tool for tool in tools}
+    # This finds "Calculator":
+    tool = tools_map.get(json_data["function"])
+    if not tool:
+        print(f"Error: unknown tool {json_data['function']}")
+        return 1
+    result = tool(**json_data["function_parameters"]).run()
+    print(f"  Call {json_data['function']} gave result {result}")
+    return 0
 
 
-# A example structured output based on pydantic models. The LLM will create an entry for a Book database out of an unstructured text.
 class Category(Enum):
-    """
-    The category of the book.
-    """
+    """The category of the book."""
     Fiction = "Fiction"
     NonFiction = "Non-Fiction"
 
 
 class Book(BaseModel):
-    """
-    Represents an entry about a book.
-    """
+    """Represents an entry about a book."""
     title: str = Field(..., description="Title of the book.")
     author: str = Field(..., description="Author of the book.")
     published_year: Optional[int] = Field(..., description="Publishing year of the book.")
@@ -125,33 +162,42 @@ class Book(BaseModel):
     summary: str = Field(..., description="Summary of the book.")
 
 
-# We need no additional parameters other than our list of pydantic models.
-gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation([Book])
+def example_struct(host):
+    """A example structured output based on pydantic models.
 
-system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation
+    The LLM will create an entry for a Book database out of an unstructured
+    text. We need no additional parameters other than our list of pydantic
+    models.
+    """
+    print("- example_struct")
+    tools = [Book]
+    gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(pydantic_model_list=tools)
+    system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation
+    text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 1961–1963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
+    prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
+    text = create_completion(host, prompt, gbnf_grammar)
+    json_data = json.loads(text)
+    # In this case, there's no function nor function_parameters.
+    # Here the result will vary based on the LLM used.
+    keys = sorted(["title", "author", "published_year", "keywords", "category", "summary"])
+    if keys != sorted(json_data.keys()):
+        print(f"Unexpected result: {sorted(json_data.keys())}")
+        return 1
+    book = Book(**json_data)
+    print(f"  As a Book object: %s" % book)
+    return 0
 
-text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 1961–1963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
-prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
-
-text = create_completion(prompt=prompt, grammar=gbnf_grammar)
-
-json_data = json.loads(text)
-
-print(Book(**json_data))
-# An example for parallel function calling with a Python function, a pydantic function model and an OpenAI like function definition.
 
 def get_current_datetime(output_format: Optional[str] = None):
-    """
-    Get the current date and time in the given format.
+    """Get the current date and time in the given format.
+
     Args:
          output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S'
     """
-    if output_format is None:
-        output_format = '%Y-%m-%d %H:%M:%S'
-    return datetime.datetime.now().strftime(output_format)
+    return datetime.datetime.now().strftime(output_format or "%Y-%m-%d %H:%M:%S")
 
 
-# Example function to get the weather
+# Example function to get the weather.
 def get_current_weather(location, unit):
     """Get the current weather in a given location"""
     if "London" in location:
@@ -160,68 +206,107 @@ def get_current_weather(location, unit):
         return json.dumps({"location": "New York", "temperature": "24", "unit": unit.value})
     elif "North Pole" in location:
         return json.dumps({"location": "North Pole", "temperature": "-42", "unit": unit.value})
-    else:
-        return json.dumps({"location": location, "temperature": "unknown"})
+    return json.dumps({"location": location, "temperature": "unknown"})
 
 
-# Here is a function definition in OpenAI style
-current_weather_tool = {
-    "type": "function",
-    "function": {
-        "name": "get_current_weather",
-        "description": "Get the current weather in a given location",
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "location": {
-                    "type": "string",
-                    "description": "The city and state, e.g. San Francisco, CA",
+def example_concurrent(host):
+    """An example for parallel function calling with a Python function, a pydantic
+    function model and an OpenAI like function definition.
+    """
+    print("- example_concurrent")
+    # Function definition in OpenAI style.
+    current_weather_tool = {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state, e.g. San Francisco, CA",
+                    },
+                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                 },
-                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+                "required": ["location"],
             },
-            "required": ["location"],
         },
-    },
-}
+    }
+    # Convert OpenAI function definition into pydantic model.
+    current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
+    # Add the actual function to a pydantic model.
+    current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)
 
-# Convert OpenAI function definition into pydantic model
-current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
-# Add the actual function to a pydantic model
-current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)
+    # Convert normal Python function to a pydantic model.
+    current_datetime_model = create_dynamic_model_from_function(get_current_datetime)
 
-# Convert normal Python function to a pydantic model
-current_datetime_model = create_dynamic_model_from_function(get_current_datetime)
-
-tool_list = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
+    tools = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
+    gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
+        pydantic_model_list=tools, outer_object_name="function",
+        outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True)
+    system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
+    text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
+    prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
+    text = create_completion(host, prompt, gbnf_grammar)
+    json_data = json.loads(text)
+    expected = [
+      {
+        "function": "get_current_datetime",
+        "params": {
+          "output_format": "%Y-%m-%d %H:%M:%S"
+        }
+      },
+      {
+        "function": "get_current_weather",
+        "params": {
+          "location": "London",
+          "unit": "celsius"
+        }
+      },
+      {
+        "function": "Calculator",
+        "params": {
+          "number_one": 42,
+          "operation": "multiply",
+          "number_two": 42
+        }
+      }
+    ]
+    res = 0
+    if json_data != expected:
+        print("  Result is not as expected!")
+        print("  This can happen on highly quantized models")
+        res = 1
+    tools_map = {tool.__name__:tool for tool in tools}
+    for call in json_data:
+      tool = tools_map.get(call["function"])
+      if not tool:
+          print(f"Error: unknown tool {call['function']}")
+          return 1
+      result = tool(**call["params"]).run()
+      print(f"  Call {call['function']} returned {result}")
+    # Should output something like this:
+    #   Call get_current_datetime returned 2024-07-15 09:50:38
+    #   Call get_current_weather returned {"location": "London", "temperature": "42", "unit": "celsius"}
+    #   Call Calculator returned 1764
+    return res
 
 
-gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
-    pydantic_model_list=tool_list, outer_object_name="function",
-    outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True)
-
-system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
+def main():
+    parser = argparse.ArgumentParser(description=sys.modules[__name__].__doc__)
+    parser.add_argument("--host", default="localhost:8080", help="llama.cpp server")
+    parser.add_argument("-v", "--verbose", action="store_true", help="enables logging")
+    args = parser.parse_args()
+    logging.basicConfig(level=logging.INFO if args.verbose else logging.ERROR)
+    ret = 0
+    # Comment out below to only run the example you want.
+    ret = ret or example_rce(args.host)
+    ret = ret or example_calculator(args.host)
+    ret = ret or example_struct(args.host)
+    ret = ret or example_concurrent(args.host)
+    return ret
 
 
-text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
-prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
-
-text = create_completion(prompt=prompt, grammar=gbnf_grammar)
-
-json_data = json.loads(text)
-
-print(json_data)
-# Should output something like this:
-# [{'function': 'get_current_datetime', 'params': {'output_format': '%Y-%m-%d %H:%M:%S'}}, {'function': 'get_current_weather', 'params': {'location': 'London', 'unit': 'celsius'}}, {'function': 'Calculator', 'params': {'number_one': 42, 'operation': 'multiply', 'number_two': 42}}]
-
-
-for call in json_data:
-    if call["function"] == "Calculator":
-        print(Calculator(**call["params"]).run())
-    elif call["function"] == "get_current_datetime":
-        print(current_datetime_model(**call["params"]).run())  # pyright: ignore[reportAttributeAccessIssue]
-    elif call["function"] == "get_current_weather":
-        print(current_weather_tool_model(**call["params"]).run())  # pyright: ignore[reportAttributeAccessIssue]
-# Should output something like this:
-# 2024-01-14 13:36:06
-# {"location": "London", "temperature": "42", "unit": "celsius"}
-# 1764
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/examples/server/README.md b/examples/server/README.md
index e477d1501..33a2b95cc 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -5,7 +5,7 @@ Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/
 Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
 
 **Features:**
- * LLM inference of F16 and quantum models on GPU and CPU
+ * LLM inference of F16 and quantized models on GPU and CPU
  * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
  * Parallel decoding with multi-user support
  * Continuous batching
@@ -444,7 +444,7 @@ node index.js
 
     `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
 
-    `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
+    `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
     By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
 
     `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
diff --git a/examples/server/public/index-new.html b/examples/server/public/index-new.html
index bf2b0a3f0..c87dd8f1e 100644
--- a/examples/server/public/index-new.html
+++ b/examples/server/public/index-new.html
@@ -225,7 +225,7 @@
         throw new Error("already running");
       }
       controller.value = new AbortController();
-      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: URL.parse('.', document.baseURI).href })) {
+      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: new URL('.', document.baseURI).href })) {
         const data = chunk.data;
         if (data.stop) {
           while (
diff --git a/examples/server/public/index.html b/examples/server/public/index.html
index a15424613..07fec6a38 100644
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -1,5 +1,4 @@
 <html>
-
 <head>
   <meta charset="UTF-8">
   <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
@@ -132,12 +131,20 @@
       align-items: stretch;
     }
 
-    .right {
+    .message-controls {
       display: flex;
-      flex-direction: row;
-      gap: 0.5em;
       justify-content: flex-end;
     }
+    .message-controls > div:nth-child(2) {
+      display: flex;
+      flex-direction: column;
+      gap: 0.5em;
+    }
+    .message-controls > div:nth-child(2) > div {
+      display: flex;
+      margin-left: auto;
+      gap: 0.5em;
+    }
 
     fieldset {
       border: none;
@@ -276,6 +283,7 @@
 
     import { llama } from './completion.js';
     import { SchemaConverter } from './json-schema-to-grammar.mjs';
+
     let selected_image = false;
     var slot_id = -1;
 
@@ -447,6 +455,9 @@
 
     /* END: Support for storing prompt templates and parameters in browsers LocalStorage */
 
+    const tts = window.speechSynthesis;
+    const ttsVoice = signal(null)
+
     const llamaStats = signal(null)
     const controller = signal(null)
 
@@ -479,7 +490,7 @@
         throw new Error("already running");
       }
       controller.value = new AbortController();
-      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: URL.parse('.', document.baseURI).href })) {
+      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: new URL('.', document.baseURI).href })) {
         const data = chunk.data;
 
         if (data.stop) {
@@ -596,8 +607,51 @@
       });
     }
 
+    const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
+    const talkRecognition = SpeechRecognition ? new SpeechRecognition() : null;
     function MessageInput() {
-      const message = useSignal("")
+      const message = useSignal("");
+
+      const talkActive = useSignal(false);
+      const sendOnTalk = useSignal(false);
+      const talkStop = (e) => {
+        if (e) e.preventDefault();
+
+        talkActive.value = false;
+        talkRecognition?.stop();
+      }
+      const talk = (e) => {
+        e.preventDefault();
+
+        if (talkRecognition)
+          talkRecognition.start();
+        else
+          alert("Speech recognition is not supported by this browser.");
+      }
+      if(talkRecognition) {
+        talkRecognition.onstart = () => {
+          talkActive.value = true;
+        }
+        talkRecognition.onresult = (e) => {
+          if (event.results.length > 0) {
+            message.value = event.results[0][0].transcript;
+            if (sendOnTalk.value) {
+              submit(e);
+            }
+          }
+        }
+        talkRecognition.onspeechend = () => {
+          talkStop();
+        }
+      }
+
+      const ttsVoices = useSignal(tts?.getVoices() || []);
+      const ttsVoiceDefault = computed(() => ttsVoices.value.find(v => v.default));
+      if (tts) {
+        tts.onvoiceschanged = () => {
+          ttsVoices.value = tts.getVoices();
+        }
+      }
 
       const submit = (e) => {
         stop(e);
@@ -624,11 +678,45 @@
                value="${message}"
             />
           </div>
-          <div class="right">
-            <button type="submit" disabled=${generating.value}>Send</button>
-            <button onclick=${uploadImage}>Upload Image</button>
-            <button onclick=${stop} disabled=${!generating.value}>Stop</button>
-            <button onclick=${reset}>Reset</button>
+          <div class="message-controls">
+            <div> </div>
+            <div>
+              <div>
+                <button type="submit" disabled=${generating.value || talkActive.value}>Send</button>
+                <button disabled=${generating.value || talkActive.value} onclick=${uploadImage}>Upload Image</button>
+                <button onclick=${stop} disabled=${!generating.value}>Stop</button>
+                <button onclick=${reset}>Reset</button>
+              </div>
+              <div>
+                <a href="#" style="cursor: help;" title="Help" onclick=${e => {
+                  e.preventDefault();
+                  alert(`STT supported by your browser: ${SpeechRecognition ? 'Yes' : 'No'}\n` +
+                  `(TTS and speech recognition are not provided by llama.cpp)\n` +
+                  `Note: STT requires HTTPS to work.`);
+                }}>[?]</a>
+                <button disabled=${generating.value} onclick=${talkActive.value ? talkStop : talk}>${talkActive.value ? "Stop Talking" : "Talk"}</button>
+                <div>
+                  <input type="checkbox" id="send-on-talk" name="send-on-talk" checked="${sendOnTalk}" onchange=${(e) => sendOnTalk.value = e.target.checked} />
+                  <label for="send-on-talk" style="line-height: initial;">Send after talking</label>
+                </div>
+              </div>
+              <div>
+                <a href="#" style="cursor: help;" title="Help" onclick=${e => {
+                  e.preventDefault();
+                  alert(`TTS supported by your browser: ${tts ? 'Yes' : 'No'}\n(TTS and speech recognition are not provided by llama.cpp)`);
+                }}>[?]</a>
+                <label for="tts-voices" style="line-height: initial;">Bot Voice:</label>
+                <select id="tts-voices" name="tts-voices" onchange=${(e) => ttsVoice.value = e.target.value} style="max-width: 100px;">
+                  <option value="" selected="${!ttsVoice.value}">None</option>
+                  ${[
+                    ...(ttsVoiceDefault.value ? [ttsVoiceDefault.value] : []),
+                    ...ttsVoices.value.filter(v => !v.default),
+                  ].map(
+                    v => html`<option value="${v.name}" selected="${ttsVoice.value === v.name}">${v.name} (${v.lang}) ${v.default ? '(default)' : ''}</option>`
+                  )}
+                </select>
+              </div>
+            </div>
           </div>
         </form>
       `
@@ -659,26 +747,86 @@
         }
       }, [messages])
 
+      const ttsChatLineActiveIx = useSignal(undefined);
+      const ttsChatLine = (e, ix, msg) => {
+        if (e) e.preventDefault();
+
+        if (!tts || !ttsVoice.value || !('SpeechSynthesisUtterance' in window)) return;
+
+        const ttsVoices = tts.getVoices();
+        const voice = ttsVoices.find(v => v.name === ttsVoice.value);
+        if (!voice) return;
+
+        if (ttsChatLineActiveIx.value !== undefined) {
+          tts.cancel();
+          if (ttsChatLineActiveIx.value === ix) {
+            ttsChatLineActiveIx.value = undefined;
+            return;
+          }
+        }
+
+        ttsChatLineActiveIx.value = ix;
+        let ttsUtter = new SpeechSynthesisUtterance(msg);
+        ttsUtter.voice = voice;
+        ttsUtter.onend = e => {
+          ttsChatLineActiveIx.value = undefined;
+        };
+        tts.speak(ttsUtter);
+      }
+
       const isCompletionMode = session.value.type === 'completion'
+
+      // Try play the last bot message
+      const lastCharChatLinesIxs = useSignal([]);
+      const lastCharChatLinesIxsOld = useSignal([]);
+      useEffect(() => {
+        if (
+          !isCompletionMode
+          && lastCharChatLinesIxs.value.length !== lastCharChatLinesIxsOld.value.length
+          && !generating.value
+        ) {
+          const ix = lastCharChatLinesIxs.value[lastCharChatLinesIxs.value.length - 1];
+          if (ix !== undefined) {
+            const msg = messages[ix];
+            ttsChatLine(null, ix, Array.isArray(msg) ? msg[1].map(m => m.content).join('') : msg);
+          }
+
+          lastCharChatLinesIxsOld.value = structuredClone(lastCharChatLinesIxs.value);
+        }
+      }, [generating.value]);
+
       const chatLine = ([user, data], index) => {
         let message
-        const isArrayMessage = Array.isArray(data)
+        const isArrayMessage = Array.isArray(data);
+        const text = isArrayMessage ?
+            data.map(msg => msg.content).join('') :
+            data;
         if (params.value.n_probs > 0 && isArrayMessage) {
           message = html`<${Probabilities} data=${data} />`
         } else {
-          const text = isArrayMessage ?
-            data.map(msg => msg.content).join('') :
-            data;
           message = isCompletionMode ?
             text :
             html`<${Markdownish} text=${template(text)} />`
         }
+
+        const fromBot = user && user === '{{char}}';
+        if (fromBot && !lastCharChatLinesIxs.value.includes(index))
+          lastCharChatLinesIxs.value.push(index);
+
         if (user) {
-          return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
+          return html`
+          <div>
+            <p key=${index}><strong>${template(user)}:</strong> ${message}</p>
+            ${
+              fromBot && ttsVoice.value
+              && html`<button disabled=${generating.value} onclick=${e => ttsChatLine(e, index, text)} aria-label=${ttsChatLineActiveIx.value === index ? 'Pause' : 'Play'}>${ ttsChatLineActiveIx.value === index ? '⏸️' : '▶️' }</div>`
+            }
+          </div>
+          `;
         } else {
           return isCompletionMode ?
             html`<span key=${index}>${message}</span>` :
-            html`<p key=${index}>${message}</p>`
+            html`<div><p key=${index}>${message}</p></div>`
         }
       };
 
diff --git a/examples/train-text-from-scratch/CMakeLists.txt b/examples/train-text-from-scratch/CMakeLists.txt
deleted file mode 100644
index 9a1d2a35e..000000000
--- a/examples/train-text-from-scratch/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-set(TARGET llama-train-text-from-scratch)
-add_executable(${TARGET} train-text-from-scratch.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/train-text-from-scratch/README.md b/examples/train-text-from-scratch/README.md
deleted file mode 100644
index 3abae2380..000000000
--- a/examples/train-text-from-scratch/README.md
+++ /dev/null
@@ -1,27 +0,0 @@
-# train-text-from-scratch
-
-Basic usage instructions:
-
-```bash
-# get training data
-wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
-
-# train
-./bin/llama-train-text-from-scratch \
-        --vocab-model ../models/ggml-vocab-llama.gguf \
-        --ctx 64 --embd 256 --head 8 --layer 16 \
-        --checkpoint-in  chk-shakespeare-256x16-LATEST.gguf \
-        --checkpoint-out chk-shakespeare-256x16-ITERATION.gguf \
-        --model-out ggml-shakespeare-256x16-f32-ITERATION.gguf \
-        --train-data "shakespeare.txt" \
-        -t 6 -b 16 --seed 1 --adam-iter 256 \
-        --no-checkpointing
-
-# predict
-./bin/llama-cli -m ggml-shakespeare-256x16-f32.gguf
-```
-
-Output files will be saved every N iterations (config with `--save-every N`).
-The pattern "ITERATION" in the output filenames will be replaced with the iteration number and "LATEST" for the latest output.
-
-To train GGUF models just pass them to `--checkpoint-in FN`.
diff --git a/examples/train-text-from-scratch/convert_train_checkpoint_to_gguf.py b/examples/train-text-from-scratch/convert_train_checkpoint_to_gguf.py
deleted file mode 100644
index e045beb72..000000000
--- a/examples/train-text-from-scratch/convert_train_checkpoint_to_gguf.py
+++ /dev/null
@@ -1,499 +0,0 @@
-#!/usr/bin/env python3
-# train-text-from-scratch checkpoint --> gguf conversion
-
-import argparse
-import os
-import struct
-import sys
-import numpy as np
-from pathlib import Path
-
-if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py'))
-import gguf
-
-# gguf constants
-LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
-LLM_KV_OPTIMIZER_TYPE_ADAM  = "adam"
-LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
-LLM_KV_OPTIMIZER_FILE_VERSION               = "optimizer.file_version"
-LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT     = "optimizer.convergence_past_count"
-LLM_KV_OPTIMIZER_PARAMETER_COUNT            = "optimizer.parameter_count"
-LLM_KV_OPTIMIZER_ITERATION_COUNT            = "optimizer.iteration_count"
-LLM_KV_OPTIMIZER_JUST_INITIALIZED           = "optimizer.just_initialized"
-LLM_KV_OPTIMIZER_ADAM_BEST_LOSS             = "optimizer.adam.best_loss"
-LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS         = "optimizer.adam.previous_loss"
-LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT  = "optimizer.adam.no_improvement_count"
-LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
-LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS            = "optimizer.lbfgs.best_loss"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP     = "optimizer.lbfgs.line_search_step"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J        = "optimizer.lbfgs.line_search_j"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K        = "optimizer.lbfgs.line_search_k"
-LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END      = "optimizer.lbfgs.line_search_end"
-LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
-
-LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS    = "optimizer.adam.first_moments"
-LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS   = "optimizer.adam.second_moments"
-LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
-
-LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS  = "optimizer.lbfgs.current_parameters"
-LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
-LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS   = "optimizer.lbfgs.current_gradients"
-LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS  = "optimizer.lbfgs.previous_gradients"
-LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION    = "optimizer.lbfgs.search_direction"
-LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES    = "optimizer.lbfgs.past_loss_values"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA        = "optimizer.lbfgs.memory_alpha"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS           = "optimizer.lbfgs.memory_ys"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S            = "optimizer.lbfgs.memory_s"
-LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y            = "optimizer.lbfgs.memory_y"
-
-LLM_KV_TRAINING_TYPE_TRAIN_MODEL   = "train_model"
-LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
-LLM_KV_TRAINING_TYPE               = "training.type"
-LLM_KV_TRAINING_FILE_VERSION       = "training.file_version"
-LLM_KV_TRAINING_ITERATION_COUNT    = "training.iteration_count"
-LLM_KV_TRAINING_SAMPLE_COUNT       = "training.sample_count"
-LLM_KV_TRAINING_TOKEN_COUNT        = "training.token_count"
-
-class Tensor:
-    def __init__(self, dtype='f', ne=None):
-        if ne is None:
-            ne = []
-        self.dtype = dtype
-        self.ne = ne
-        self.nbytes = 0
-        if self.dtype == 'f':
-            if len(self.ne) == 0:
-                self.nbytes = 0
-            else:
-                self.nbytes = int(np.prod(self.ne)) * 4
-        else:
-            raise ValueError(f"Unhandled data type '{self.dtype}'")
-
-    def load(self, data, offset):
-        nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-
-        assert(nd == len(self.ne))
-        ne = []
-        for d in range(nd):
-            n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-            ne.append(n)
-
-        assert(tuple(ne) == tuple(self.ne))
-
-        if self.dtype == 'f':
-            assert(dtype == 0)
-        else:
-            raise ValueError(f"Unhandled data type '{self.dtype}'")
-
-        self.name = bytes(data[offset:offset+namelen]); offset += namelen
-        # 32-byte alignment
-        offset += (0 - offset) & 31
-        self.data = data[offset:offset+self.nbytes]
-        offset += self.nbytes
-        return offset
-
-    def max_storage_size(self):
-        result = 0
-        result += 4 # nd
-        result += 4 # namelen
-        result += 4 # dtype
-        result += len(self.ne)*8 # ne
-        result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
-        result += 31 # 32-byte alignment
-        result += self.nbytes
-        return result
-
-    def save_gguf(self, gguf_writer, name):
-        gguf_writer.add_tensor(
-            name=name,
-            tensor=self.data,
-            raw_shape=np.array(list(reversed(self.ne))),
-            raw_dtype=gguf.GGMLQuantizationType.F32)
-
-class OptimizationParamsV0:
-    def __init__(self):
-        pass
-
-    def load(self, data, offset):
-        self.type                 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_threads            = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.past                 = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.delta                = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.print_forward_graph  = struct.unpack('<?', bytes(data[offset:offset + 1]))[0];  offset += 4 # 32bit-aligned
-        self.print_backward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0];  offset += 4 # 32bit-aligned
-        self.adam_n_iter          = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.adam_sched           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.adam_decay           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.adam_alpha           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.adam_beta1           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.adam_beta2           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.adam_eps             = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.adam_eps_f           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.adam_eps_g           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_m              = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_n_iter         = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_max_linesearch = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_eps            = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_ftol           = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_wolfe          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_min_step       = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_max_step       = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.lbfgs_linesearch     = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        return offset
-
-class OptimizationContext:
-    def __init__(self):
-        pass
-
-    def load(self, data, offset):
-        self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
-        offset += 4
-
-        if self.version == 0:
-            params = OptimizationParamsV0()
-            offset = params.load(data, offset)
-            self.past = params.past
-            self.lbfgs_m = params.lbfgs_m
-            self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0];  offset += 8
-            self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]);  offset += 4
-            self.type = params.type
-
-            self.adam_m  = Tensor('f', [self.nx])
-            self.adam_v  = Tensor('f', [self.nx])
-            self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
-
-            self.lbfgs_x    = Tensor('f', [self.nx])
-            self.lbfgs_xp   = Tensor('f', [self.nx])
-            self.lbfgs_g    = Tensor('f', [self.nx])
-            self.lbfgs_gp   = Tensor('f', [self.nx])
-            self.lbfgs_d    = Tensor('f', [self.nx])
-            self.lbfgs_pf   = Tensor('f', [self.past] if self.past > 0 else [])
-            self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
-            self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
-            self.lbfgs_lms  = Tensor('f', [self.nx, self.lbfgs_m])
-            self.lbfgs_lmy  = Tensor('f', [self.nx, self.lbfgs_m])
-
-            if self.type == 0:
-                # these tensors are stored, but we don't need their data
-                x  = Tensor('f', [self.nx])
-                g  = Tensor('f', [self.nx])
-                g2 = Tensor('f', [self.nx])
-                mh = Tensor('f', [self.nx])
-                vh = Tensor('f', [self.nx])
-
-                offset = x.load(data, offset)
-                offset = g.load(data, offset)
-                offset = g2.load(data, offset)
-                offset = self.adam_m.load(data, offset)
-                offset = self.adam_v.load(data, offset)
-                offset = mh.load(data, offset)
-                offset = vh.load(data, offset)
-                offset = self.adam_pf.load(data, offset)
-
-                self.adam_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.adam_fx_prev          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-
-            elif self.type == 1:
-                offset = self.lbfgs_x.load(data, offset)
-                offset = self.lbfgs_xp.load(data, offset)
-                offset = self.lbfgs_g.load(data, offset)
-                offset = self.lbfgs_gp.load(data, offset)
-                offset = self.lbfgs_d.load(data, offset)
-                offset = self.lbfgs_pf.load(data, offset)
-                offset = self.lbfgs_lmal.load(data, offset)
-                offset = self.lbfgs_lmys.load(data, offset)
-                offset = self.lbfgs_lms.load(data, offset)
-                offset = self.lbfgs_lmy.load(data, offset)
-
-                self.lbfgs_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_step             = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_j                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_k                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_end              = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-
-            else:
-                raise ValueError('Unknown optimizer type')
-
-
-        elif self.version == 1:
-            self.past    = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.nx      = struct.unpack('N',  bytes(data[offset:offset + 8]))[0];  offset += 8
-            self.iter    = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-            self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]);  offset += 4
-
-            self.adam_m  = Tensor('f', [self.nx])
-            self.adam_v  = Tensor('f', [self.nx])
-            self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
-
-            self.lbfgs_x    = Tensor('f', [self.nx])
-            self.lbfgs_xp   = Tensor('f', [self.nx])
-            self.lbfgs_g    = Tensor('f', [self.nx])
-            self.lbfgs_gp   = Tensor('f', [self.nx])
-            self.lbfgs_d    = Tensor('f', [self.nx])
-            self.lbfgs_pf   = Tensor('f', [self.past] if self.past > 0 else [])
-            self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
-            self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
-            self.lbfgs_lms  = Tensor('f', [self.nx, self.lbfgs_m])
-            self.lbfgs_lmy  = Tensor('f', [self.nx, self.lbfgs_m])
-
-            # forgot to save type in version 1:
-            # guess self.type from number of remaining bytes
-            size_type_0 = 12 + sum([t.max_storage_size() for t in
-                                    [self.adam_m, self.adam_v]
-                                    +([self.adam_pf] if (self.past > 0) else [])])
-            size_type_1 = 24 + sum([t.max_storage_size() for t in
-                                    [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
-                                     self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
-                                     self.lbfgs_lmal, self.lbfgs_lmys,
-                                     self.lbfgs_lms, self.lbfgs_lmy]
-                                     +([self.lbfgs_pf] if (self.past > 0) else [])])
-            # due to alignment padding the size might not by exact
-            # but the difference in size for both types is significant,
-            # so we can just use whichever is closest
-            remaining = len(data) - offset
-            if abs(remaining - size_type_0) < abs(remaining - size_type_1):
-                self.type = 0
-            else:
-                self.type = 1
-
-            if self.type == 0:
-                offset = self.adam_m.load(data, offset)
-                offset = self.adam_v.load(data, offset)
-                offset = self.adam_pf.load(data,offset)
-
-                self.adam_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.adam_fx_prev          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-
-            elif self.type == 1:
-                offset = self.lbfgs_x.load(data, offset)
-                offset = self.lbfgs_xp.load(data, offset)
-                offset = self.lbfgs_g.load(data, offset)
-                offset = self.lbfgs_gp.load(data, offset)
-                offset = self.lbfgs_d.load(data, offset)
-                offset = self.lbfgs_pf.load(data, offset)
-                offset = self.lbfgs_lmal.load(data, offset)
-                offset = self.lbfgs_lmys.load(data, offset)
-                offset = self.lbfgs_lms.load(data, offset)
-                offset = self.lbfgs_lmy.load(data, offset)
-
-                self.lbfgs_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_step             = struct.unpack('<f', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_j                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_k                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_end              = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-                self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0];  offset += 4
-
-        else:
-            raise ValueError('Invalid version of checkpoint file')
-
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
-        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
-        gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
-        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
-        gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
-
-        if self.type == 0:
-            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
-            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
-
-            self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
-            self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
-            if self.past > 0:
-                self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
-
-        elif self.type == 1:
-            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
-            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
-            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
-            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
-            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
-            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
-            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
-
-            self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
-            self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
-            self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
-            self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
-            self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
-            if self.past > 0:
-                self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
-            self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
-            self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
-            self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
-            self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
-        else:
-            raise ValueError('Unknown optimizer type')
-
-class ModelParams:
-    def __init__(self):
-        pass
-
-    def load(self, data, offset):
-        self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_embd  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_mult  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_head  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        self.n_rot   = struct.unpack('<I', bytes(data[offset:offset + 4]))[0];  offset += 4
-        return offset
-
-    def get_n_ff(self):
-        # struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
-        return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
-
-    def save_gguf(self, gguf_writer):
-        # self.n_vocab not saved
-        gguf_writer.add_embedding_length(self.n_embd)
-        gguf_writer.add_head_count(self.n_head)
-        gguf_writer.add_block_count(self.n_layer)
-        gguf_writer.add_rope_dimension_count(self.n_rot)
-        gguf_writer.add_feed_forward_length(self.get_n_ff())
-
-def tensor_name(key, bid=None):
-    return gguf.TENSOR_NAMES[key].format(bid=bid) + ".weight"
-
-class Layer:
-    def __init__(self, params, bid):
-        self.bid = bid
-        self.att_norm = Tensor('f', [params.n_embd])
-        self.wq       = Tensor('f', [params.n_embd, params.n_embd])
-        self.wk       = Tensor('f', [params.n_embd, params.n_embd])
-        self.wv       = Tensor('f', [params.n_embd, params.n_embd])
-        self.wo       = Tensor('f', [params.n_embd, params.n_embd])
-        self.ffn_norm = Tensor('f', [params.n_embd])
-        self.w1       = Tensor('f', [params.n_embd, params.get_n_ff()])
-        self.w2       = Tensor('f', [params.get_n_ff(), params.n_embd])
-        self.w3       = Tensor('f', [params.n_embd, params.get_n_ff()])
-
-    def load(self, data, offset):
-        offset = self.att_norm.load(data, offset)
-        offset = self.wq.load(data, offset)
-        offset = self.wk.load(data, offset)
-        offset = self.wv.load(data, offset)
-        offset = self.wo.load(data, offset)
-        offset = self.ffn_norm.load(data, offset)
-        offset = self.w1.load(data, offset)
-        offset = self.w2.load(data, offset)
-        offset = self.w3.load(data, offset)
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        self.att_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid))
-        self.wq.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q,    self.bid))
-        self.wk.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K,    self.bid))
-        self.wv.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V,    self.bid))
-        self.wo.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT,  self.bid))
-        self.ffn_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM,  self.bid))
-        self.w1.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE,  self.bid))
-        self.w2.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN,  self.bid))
-        self.w3.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP,    self.bid))
-
-class Model:
-    def __init__(self):
-        self.params = ModelParams()
-        self.layers = []
-
-    def load(self, data, offset):
-        offset = self.params.load(data, offset)
-
-        self.tok_embd = Tensor('f', [self.params.n_embd, self.params.n_vocab])
-        self.norm     = Tensor('f', [self.params.n_embd])
-        self.output   = Tensor('f', [self.params.n_embd, self.params.n_vocab])
-
-        offset = self.tok_embd.load(data, offset)
-        offset = self.norm.load(data, offset)
-        offset = self.output.load(data, offset)
-
-        self.layers.clear()
-        for bid in range(self.params.n_layer):
-            layer = Layer(self.params, bid)
-            offset = layer.load(data, offset)
-            self.layers.append(layer)
-
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        self.params.save_gguf(gguf_writer)
-
-        self.tok_embd.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD))
-        self.norm.save_gguf    (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM))
-        self.output.save_gguf  (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT))
-
-        for layer in self.layers:
-            layer.save_gguf(gguf_writer)
-
-class Checkpoint:
-    def __init__(self):
-        self.model = Model()
-        self.opt_ctx = OptimizationContext()
-
-    def load(self, data, offset):
-        magic   = bytes(reversed(data[offset:offset + 4])); offset += 4
-        if magic != b'ggcp':
-            raise ValueError(f"File header magic indicates, that this is no checkpoint file. Expected 'ggcp', Got '{str(magic)}'")
-
-        self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        if self.version != 0:
-            raise ValueError('Invalid version of checkpoint file')
-
-        self.train_its     = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-        self.train_tokens  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
-
-        offset = self.model.load(data, offset)
-        offset = self.opt_ctx.load(data, offset)
-
-        return offset
-
-    def save_gguf(self, gguf_writer):
-        gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
-        gguf_writer.add_layer_norm_rms_eps(1e-5)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION,    0)
-        gguf_writer.add_string(LLM_KV_TRAINING_TYPE,            LLM_KV_TRAINING_TYPE_TRAIN_MODEL)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT,    self.train_samples)
-        gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT,     self.train_tokens)
-        self.model.save_gguf(gguf_writer)
-        self.opt_ctx.save_gguf(gguf_writer)
-
-def handle_args():
-    parser = argparse.ArgumentParser(description = 'Convert train-text-from-scratch checkpoints to GGUF')
-    parser.add_argument('--input',  '-i', type = Path, help = 'Input train checkpoint filename', required=True)
-    parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename', required=True)
-    return parser.parse_args()
-
-def main():
-    cfg = handle_args()
-    data = np.memmap(cfg.input, mode = 'r')
-    chk = Checkpoint()
-    offset = 0
-    offset = chk.load(data, offset)
-    # we should have read all available data
-    assert(offset == len(data))
-
-    gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
-    chk.save_gguf(gguf_writer)
-    print("    gguf: write header")
-    gguf_writer.write_header_to_file()
-    print("    gguf: write metadata")
-    gguf_writer.write_kv_data_to_file()
-    print("    gguf: write tensors")
-    gguf_writer.write_tensors_to_file()
-    gguf_writer.close()
-
-if __name__ == '__main__':
-    main()
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
deleted file mode 100644
index b779f6bd4..000000000
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ /dev/null
@@ -1,1253 +0,0 @@
-#include "ggml.h"
-#include "ggml-alloc.h"
-#include "ggml-backend.h"
-#include "common.h"
-#include "train.h"
-#include "llama.h"
-#include <unordered_map>
-#include <vector>
-#include <cassert>
-#include <climits>
-#include <cstring>
-#include <cstdarg>
-#include <ctime>
-#include <random>
-#include <stdexcept>
-#include <algorithm>
-#include <string>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-struct my_llama_hparams {
-    uint32_t n_vocab = 32000;
-    uint32_t n_ctx   = 512;
-    uint32_t n_embd  = 4096;
-    uint32_t n_head  = 32;
-    uint32_t n_layer = 32;
-    uint32_t n_rot   = 64;
-    uint32_t n_ff    = 11008;
-
-    // float f_norm_eps     = 1e-5f; // falcon
-    float f_norm_rms_eps = 1e-5f; // llama
-
-    float rope_freq_base  = 10000.0f;
-    float rope_freq_scale = 1.0f;
-};
-
-struct my_llama_layer {
-    // normalization
-    struct ggml_tensor * attention_norm;
-
-    // attention
-    struct ggml_tensor * wq;
-    struct ggml_tensor * wk;
-    struct ggml_tensor * wv;
-    struct ggml_tensor * wo;
-
-    // normalization
-    struct ggml_tensor * ffn_norm;
-
-    // ff
-    struct ggml_tensor * ffn_gate; // w1
-    struct ggml_tensor * ffn_down; // w2
-    struct ggml_tensor * ffn_up;   // w3
-};
-
-struct my_llama_model {
-    struct ggml_context * ctx = NULL;
-    ggml_backend_buffer_t data = NULL;
-
-    my_llama_hparams hparams;
-
-    struct ggml_tensor * tok_embeddings;
-
-    struct ggml_tensor * norm;
-    struct ggml_tensor * output;
-
-    std::vector<my_llama_layer> layers;
-};
-
-// gguf constants (sync with gguf.py)
-static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL     = "train_model";
-static const char * LLM_KV_TRAINING_TYPE                 = "training.type";
-
-static const char * LLM_KV_GENERAL_NAME                = "general.name";
-static const char * LLM_KV_GENERAL_ARCHITECTURE        = "general.architecture";
-static const char * LLM_KV_GENERAL_FILE_TYPE           = "general.file_type";
-
-static const char * LLM_KV_CONTEXT_LENGTH              = "%s.context_length";
-static const char * LLM_KV_EMBEDDING_LENGTH            = "%s.embedding_length";
-static const char * LLM_KV_BLOCK_COUNT                 = "%s.block_count";
-static const char * LLM_KV_FEED_FORWARD_LENGTH         = "%s.feed_forward_length";
-static const char * LLM_KV_ATTENTION_HEAD_COUNT        = "%s.attention.head_count";
-static const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon";
-static const char * LLM_KV_ROPE_DIMENSION_COUNT        = "%s.rope.dimension_count";
-static const char * LLM_KV_ROPE_FREQ_BASE              = "%s.rope.freq_base"; // TODO load in llama.cpp
-static const char * LLM_KV_ROPE_SCALE_LINEAR           = "%s.rope.scale_linear";
-
-static const char * LLM_KV_TOKENIZER_MODEL             = "tokenizer.ggml.model";
-static const char * LLM_KV_TOKENIZER_LIST              = "tokenizer.ggml.tokens";
-static const char * LLM_KV_TOKENIZER_TOKEN_TYPE        = "tokenizer.ggml.token_type";
-static const char * LLM_KV_TOKENIZER_SCORES            = "tokenizer.ggml.scores";
-static const char * LLM_KV_TOKENIZER_MERGES            = "tokenizer.ggml.merges";
-static const char * LLM_KV_TOKENIZER_BOS_ID            = "tokenizer.ggml.bos_token_id";
-static const char * LLM_KV_TOKENIZER_EOS_ID            = "tokenizer.ggml.eos_token_id";
-static const char * LLM_KV_TOKENIZER_UNK_ID            = "tokenizer.ggml.unknown_token_id";
-static const char * LLM_KV_TOKENIZER_SEP_ID            = "tokenizer.ggml.seperator_token_id";
-static const char * LLM_KV_TOKENIZER_PAD_ID            = "tokenizer.ggml.padding_token_id";
-
-static const char * LLM_TENSOR_TOKEN_EMBD    = "token_embd";
-static const char * LLM_TENSOR_OUTPUT_NORM   = "output_norm";
-static const char * LLM_TENSOR_OUTPUT        = "output";
-static const char * LLM_TENSOR_ATTN_NORM     = "blk.%d.attn_norm";
-static const char * LLM_TENSOR_ATTN_Q        = "blk.%d.attn_q";
-static const char * LLM_TENSOR_ATTN_K        = "blk.%d.attn_k";
-static const char * LLM_TENSOR_ATTN_V        = "blk.%d.attn_v";
-static const char * LLM_TENSOR_ATTN_OUT      = "blk.%d.attn_output";
-static const char * LLM_TENSOR_FFN_NORM      = "blk.%d.ffn_norm";
-static const char * LLM_TENSOR_FFN_GATE      = "blk.%d.ffn_gate";
-static const char * LLM_TENSOR_FFN_DOWN      = "blk.%d.ffn_down";
-static const char * LLM_TENSOR_FFN_UP        = "blk.%d.ffn_up";
-
-static void print_params(struct my_llama_hparams * params) {
-    printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
-    printf("%s: n_ctx:   %u\n", __func__, params->n_ctx);
-    printf("%s: n_embd:  %u\n", __func__, params->n_embd);
-    printf("%s: n_head:  %u\n", __func__, params->n_head);
-    printf("%s: n_ff:    %u\n", __func__, params->n_ff);
-    printf("%s: n_layer: %u\n", __func__, params->n_layer);
-    printf("%s: n_rot:   %u\n", __func__, params->n_rot);
-}
-
-static void set_param_model(struct my_llama_model * model) {
-    const auto& hparams = model->hparams;
-
-    const uint32_t n_layer = hparams.n_layer;
-
-    struct ggml_context* ctx = model->ctx;
-
-    ggml_set_param(ctx, model->tok_embeddings);
-    ggml_set_param(ctx, model->norm);
-    ggml_set_param(ctx, model->output);
-
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        ggml_set_param(ctx, layer.attention_norm);
-        ggml_set_param(ctx, layer.wq);
-        ggml_set_param(ctx, layer.wk);
-        ggml_set_param(ctx, layer.wv);
-        ggml_set_param(ctx, layer.wo);
-        ggml_set_param(ctx, layer.ffn_norm);
-        ggml_set_param(ctx, layer.ffn_gate);
-        ggml_set_param(ctx, layer.ffn_down);
-        ggml_set_param(ctx, layer.ffn_up);
-    }
-}
-
-static void init_model(struct my_llama_model * model) {
-    const auto & hparams = model->hparams;
-
-    const uint32_t n_embd  = hparams.n_embd;
-    const uint32_t n_layer = hparams.n_layer;
-    const uint32_t n_vocab = hparams.n_vocab;
-    const uint32_t n_ff    = hparams.n_ff;
-
-
-    std::vector<char> tn_buf;
-    tn_buf.resize(GGML_MAX_NAME);
-    auto tn = [&tn_buf](const char * key) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key);
-        return tn_buf.data();
-    };
-    auto tni = [&tn_buf](const char * key, int bid) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), key, bid);
-        std::string s = tn_buf.data();
-        snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str());
-        return tn_buf.data();
-    };
-
-    // context for model tensors without their data
-    struct ggml_init_params ctx_model_params;
-    ctx_model_params.mem_size   = ggml_tensor_overhead()*2*(6 + n_layer*18);
-    ctx_model_params.mem_buffer = NULL;
-    ctx_model_params.no_alloc   = true;
-
-    struct ggml_context * ctx = ggml_init(ctx_model_params);
-    model->ctx = ctx;
-
-    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-    model->output         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-
-    ggml_set_name(model->tok_embeddings, tn(LLM_TENSOR_TOKEN_EMBD));
-    ggml_set_name(model->norm,           tn(LLM_TENSOR_OUTPUT_NORM));
-    ggml_set_name(model->output,         tn(LLM_TENSOR_OUTPUT));
-
-    model->layers.resize(n_layer);
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
-        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
-        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
-        layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
-
-        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        layer.ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff);
-        layer.ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,   n_ff, n_embd);
-        layer.ffn_up   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd,   n_ff);
-
-        ggml_set_name(layer.attention_norm, tni(LLM_TENSOR_ATTN_NORM, i));
-
-        ggml_set_name(layer.wq,             tni(LLM_TENSOR_ATTN_Q, i));
-        ggml_set_name(layer.wk,             tni(LLM_TENSOR_ATTN_K, i));
-        ggml_set_name(layer.wv,             tni(LLM_TENSOR_ATTN_V, i));
-        ggml_set_name(layer.wo,             tni(LLM_TENSOR_ATTN_OUT, i));
-
-        ggml_set_name(layer.ffn_norm,       tni(LLM_TENSOR_FFN_NORM, i));
-
-        ggml_set_name(layer.ffn_gate,       tni(LLM_TENSOR_FFN_GATE, i));
-        ggml_set_name(layer.ffn_down,       tni(LLM_TENSOR_FFN_DOWN, i));
-        ggml_set_name(layer.ffn_up,         tni(LLM_TENSOR_FFN_UP, i));
-    }
-
-    set_param_model(model);
-
-    // allocate data
-    model->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
-}
-
-static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) {
-    const auto & hparams = model->hparams;
-
-    const uint32_t n_layer = hparams.n_layer;
-
-    struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
-
-    randomize_tensor_normal(model->tok_embeddings, rnd);
-    randomize_tensor_normal(model->norm,           rnd);
-    randomize_tensor_normal(model->output,         rnd);
-
-    for (uint32_t i = 0; i < n_layer; ++i) {
-        auto & layer = model->layers[i];
-        randomize_tensor_normal(layer.attention_norm, rnd);
-
-        randomize_tensor_normal(layer.wq, rnd);
-        randomize_tensor_normal(layer.wk, rnd);
-        randomize_tensor_normal(layer.wv, rnd);
-        randomize_tensor_normal(layer.wo, rnd);
-
-        randomize_tensor_normal(layer.ffn_norm, rnd);
-
-        randomize_tensor_normal(layer.ffn_gate, rnd);
-        randomize_tensor_normal(layer.ffn_down, rnd);
-        randomize_tensor_normal(layer.ffn_up,   rnd);
-    }
-
-    free_random_normal_distribution(rnd);
-}
-
-static struct ggml_tensor * llama_build_train_graphs(
-        struct my_llama_model * model,
-        ggml_gallocr_t          alloc,
-        struct ggml_context   * ctx,
-        struct ggml_cgraph    * gf,
-        struct ggml_cgraph    * gb,
-        struct ggml_cgraph    * gb_tmp,
-        struct ggml_tensor  * * logits,
-        struct ggml_tensor    * tokens_input,
-        struct ggml_tensor    * targets,
-        const  int              n_tokens,
-        const  int              n_batch,
-        const  bool             enable_flash_attn,
-        const  bool             enable_checkpointing,
-        const  bool             measure_only) {
-
-    ggml_set_scratch(ctx, { 0, 0, nullptr, });
-    const int n_past = 0;
-    const int N = n_tokens;
-    const auto & hparams = model->hparams;
-    const int n_ctx      = hparams.n_ctx;
-    const int n_vocab    = hparams.n_vocab;
-    const int n_embd     = hparams.n_embd;
-    const int n_layer    = hparams.n_layer;
-    const int n_head     = hparams.n_head;
-    const int n_rot      = hparams.n_rot;
-    const int n_ff       = hparams.n_ff;
-    const float f_norm_rms_eps  = hparams.f_norm_rms_eps;
-    const float rope_freq_base  = hparams.rope_freq_base;
-    const float rope_freq_scale = hparams.rope_freq_scale;
-
-    auto set_name = [](struct ggml_tensor * t, const char * n) {
-        ggml_set_name(t, n);
-        if (t->grad) {
-            ggml_format_name(t->grad, "%s->grad", n);
-        }
-    };
-
-    // KQ_pos - contains the positions
-    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
-    ggml_set_input(KQ_pos);
-
-    // rope has so much parameters that we make a custom function for it
-    auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
-                (struct ggml_tensor * t) -> struct ggml_tensor * {
-        // not capturing these, to silcence warnings
-        const int rope_mode = 0;
-
-        return ggml_rope_ext(
-            ctx, t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
-        );
-    };
-
-    set_name(tokens_input, "tokens_input");
-    set_name(targets,      "targets");
-
-    GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
-    struct ggml_tensor * t00 = ggml_reshape_1d(ctx, tokens_input, N*n_batch);  set_name(t00, "t00"); assert_shape_1d(t00, N*n_batch);
-    struct ggml_tensor * t01 = ggml_get_rows(ctx, model->tok_embeddings, t00); set_name(t01, "t01"); assert_shape_2d(t01, n_embd, N*n_batch);
-
-    struct ggml_tensor * cur = t01;
-
-    std::vector<struct ggml_tensor *> checkpoints;
-    checkpoints.push_back(tokens_input);
-    checkpoints.push_back(targets);
-    checkpoints.push_back(t00);
-    checkpoints.push_back(t01);
-
-    const float kv_scale = 1.0f/sqrtf(float(n_embd)/n_head);
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct my_llama_layer & layer = model->layers[il];
-        struct ggml_tensor * t02 = ggml_rms_norm     (ctx, cur, f_norm_rms_eps);                    set_name(t02, "t02");     assert_shape_2d(t02, n_embd, N*n_batch);
-        struct ggml_tensor * t03 = ggml_repeat       (ctx, layer.attention_norm, t02);              set_name(t03, "t03");     assert_shape_2d(t03, n_embd, N*n_batch);
-        struct ggml_tensor * t04 = ggml_mul          (ctx, t03, t02);                               set_name(t04, "t04");     assert_shape_2d(t04, n_embd, N*n_batch);
-        struct ggml_tensor * t05 = ggml_mul_mat      (ctx, layer.wq, t04);                          set_name(t05, "t05");     assert_shape_2d(t05, n_embd, N*n_batch);
-        struct ggml_tensor * t06 = ggml_reshape_4d   (ctx, t05, n_embd/n_head, n_head, N, n_batch); set_name(t06, "t06");     assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t07 = rope              (t06);                                         set_name(t07, "t07");     assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t08 = ggml_mul_mat      (ctx, layer.wk, t04);                          set_name(t08, "t08");     assert_shape_2d(t08, n_embd, N*n_batch);
-        struct ggml_tensor * t09 = ggml_reshape_4d   (ctx, t08, n_embd/n_head, n_head, N, n_batch); set_name(t09, "t09");     assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t10 = rope              (t09);                                         set_name(t10, "t10");     assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t11 = ggml_mul_mat      (ctx, t04, layer.wv);                          set_name(t11, "t11");     assert_shape_2d(t11, N*n_batch, n_embd);
-        struct ggml_tensor * t12 = ggml_reshape_4d   (ctx, t11, N, n_batch, n_embd/n_head, n_head); set_name(t12, "t12");     assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head);
-        struct ggml_tensor * t13 = ggml_permute      (ctx, t07, 0, 2, 1, 3);                        set_name(t13, "t13");     assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch);
-        struct ggml_tensor * t14 = ggml_permute      (ctx, t10, 0, 2, 1, 3);                        set_name(t14, "t14");     assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch);
-        struct ggml_tensor * t15 = ggml_permute      (ctx, t12, 0, 3, 1, 2);                        set_name(t15, "t15");     assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch);
-        struct ggml_tensor * t16;
-        if (enable_flash_attn) {
-            GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
-            //t16 = ggml_flash_attn(ctx, t13, t14, t15, true);                                        set_name(t16, "t16");     assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch);
-        } else {
-            struct ggml_tensor * t16_0 = ggml_mul_mat              (ctx, t14, t13);                 set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
-            struct ggml_tensor * t16_1 = ggml_scale_inplace        (ctx, t16_0, kv_scale);          set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
-            struct ggml_tensor * t16_2 = ggml_diag_mask_inf_inplace(ctx, t16_1, n_past);            set_name(t16_2, "t16_2"); assert_shape_4d(t16_2, N, N, n_head, n_batch);
-            struct ggml_tensor * t16_3 = ggml_soft_max_inplace     (ctx, t16_2);                    set_name(t16_3, "t16_3"); assert_shape_4d(t16_3, N, N, n_head, n_batch);
-            t16 = ggml_mul_mat(ctx, t15, t16_3);                                                    set_name(t16, "t16");     assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch);
-        }
-        struct ggml_tensor * t17 = ggml_permute      (ctx, t16, 0, 2, 1, 3);                        set_name(t17, "t17");     assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t18 = ggml_cont         (ctx, t17);                                    set_name(t18, "t18");     assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch);
-        struct ggml_tensor * t19 = ggml_reshape_2d   (ctx, t18, n_embd, N*n_batch);                 set_name(t19, "t19");     assert_shape_2d(t19, n_embd, N*n_batch);
-        struct ggml_tensor * t20 = ggml_mul_mat      (ctx, layer.wo, t19);                          set_name(t20, "t20");     assert_shape_2d(t20, n_embd, N*n_batch);
-        struct ggml_tensor * t21 = ggml_add          (ctx, t20, cur);                               set_name(t21, "t21");     assert_shape_2d(t21, n_embd, N*n_batch);
-        struct ggml_tensor * t22 = ggml_rms_norm     (ctx, t21, f_norm_rms_eps);                    set_name(t22, "t22");     assert_shape_2d(t22, n_embd, N*n_batch);
-        struct ggml_tensor * t23 = ggml_repeat       (ctx, layer.ffn_norm, t22);                    set_name(t23, "t23");     assert_shape_2d(t23, n_embd, N*n_batch);
-        struct ggml_tensor * t24 = ggml_mul          (ctx, t23, t22);                               set_name(t24, "t24");     assert_shape_2d(t24, n_embd, N*n_batch);
-        struct ggml_tensor * t25 = ggml_mul_mat      (ctx, layer.ffn_up, t24);                      set_name(t25, "t25");     assert_shape_2d(t25, n_ff, N*n_batch);
-        struct ggml_tensor * t26 = ggml_mul_mat      (ctx, layer.ffn_gate, t24);                    set_name(t26, "t26");     assert_shape_2d(t26, n_ff, N*n_batch);
-        struct ggml_tensor * t27 = ggml_silu         (ctx, t26);                                    set_name(t27, "t27");     assert_shape_2d(t27, n_ff, N*n_batch);
-        struct ggml_tensor * t28 = ggml_mul          (ctx, t27, t25);                               set_name(t28, "t28");     assert_shape_2d(t28, n_ff, N*n_batch);
-        struct ggml_tensor * t29 = ggml_mul_mat      (ctx, layer.ffn_down, t28);                    set_name(t29, "t29");     assert_shape_2d(t29, n_embd, N*n_batch);
-        struct ggml_tensor * t30 = ggml_add          (ctx, t29, t21);                               set_name(t30, "t30");     assert_shape_2d(t30, n_embd, N*n_batch);
-        cur = t30;
-        checkpoints.push_back(cur);
-    }
-    struct ggml_tensor * t31   = ggml_rms_norm          (ctx, cur, f_norm_rms_eps);                 set_name(t31, "t31");     assert_shape_2d(t31, n_embd, N*n_batch);
-    struct ggml_tensor * t32   = ggml_repeat            (ctx, model->norm, t31);                    set_name(t32, "t32");     assert_shape_2d(t32, n_embd, N*n_batch);
-    struct ggml_tensor * t33   = ggml_mul               (ctx, t32, t31);                            set_name(t33, "t33");     assert_shape_2d(t33, n_embd, N*n_batch);
-    struct ggml_tensor * t34   = ggml_mul_mat           (ctx, model->output, t33);                  set_name(t34, "t34");     assert_shape_2d(t34, n_vocab, N*n_batch);
-    struct ggml_tensor * t35   = ggml_reshape_3d        (ctx, t34, n_vocab, N, n_batch);            set_name(t35, "t35");     assert_shape_3d(t35, n_vocab, N, n_batch);
-    struct ggml_tensor * t36   = ggml_cross_entropy_loss(ctx, t35, targets);                        set_name(t36, "t36");     assert_shape_1d(t36, 1);
-
-    checkpoints.push_back(t31);
-    checkpoints.push_back(t32);
-    checkpoints.push_back(t33);
-    checkpoints.push_back(t34);
-    checkpoints.push_back(t35);
-    checkpoints.push_back(t36);
-
-    ggml_build_forward_expand(gf, t36);
-
-    if (enable_checkpointing) {
-        ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size());
-    } else {
-        ggml_graph_cpy(gf, gb);
-        ggml_build_backward_expand(ctx, gf, gb, true);
-    }
-
-    if (alloc) {
-        // make sure some tensors are not reallocated by inserting new temporary nodes depending on them
-        int n_leafs_before = gb->n_leafs;
-        int n_nodes_before = gb->n_nodes;
-        // output tensors
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, 1.0f));
-        // input gradient
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f));
-        // KQ_pos
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));
-        GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
-        ggml_set_input(t36->grad);
-
-        // allocating checkpoints in one block to reduce memory fragmentation
-        // note: they will be freed in reverse order
-        for (int i = 0; i < (int) checkpoints.size(); ++i) {
-            if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) {
-                ggml_set_input(checkpoints[i]);
-            }
-        }
-
-        //int n_leafs_after = gb->n_leafs;
-        //int n_nodes_after = gb->n_nodes;
-        if (measure_only) {
-            // FIXME: will still allocate
-            ggml_gallocr_reserve(alloc, gb);
-        } else {
-            ggml_gallocr_alloc_graph(alloc, gb);
-
-            if (!measure_only) {
-                int * data = (int *) KQ_pos->data;
-                for (int i = 0; i < N; ++i) {
-                    data[i] = n_past + i;
-                }
-            }
-        }
-
-        // remove the additional nodes and leafs
-        for (int i = n_leafs_before; i < gb->n_leafs; ++i) {
-            gb->leafs[i] = NULL;
-        }
-        for (int i = n_nodes_before; i < gb->n_nodes; ++i) {
-            gb->nodes[i] = NULL;
-        }
-        gb->n_leafs = n_leafs_before;
-        gb->n_nodes = n_nodes_before;
-    }
-
-    *logits = t35;
-    return t36;
-}
-
-#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
-do { \
-    const std::string skey(key); \
-    const int kid = gguf_find_key(ctx, skey.c_str()); \
-    if (kid >= 0) { \
-        enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
-        if (ktype != (type)) { \
-            die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \
-        } \
-        (dst) = func(ctx, kid); \
-    } else if (req) { \
-        die_fmt("key not found in model: %s", skey.c_str()); \
-    } \
-} while (0)
-
-static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model) {
-    // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read
-    std::string arch;
-
-    std::vector<char> keybuf;
-    keybuf.resize(512);
-    auto kv = [&arch, &keybuf](const char * key) -> const char * {
-        snprintf(keybuf.data(), keybuf.size(), key, arch.c_str());
-        return keybuf.data();
-    };
-
-    std::vector<char> tn_buf;
-    tn_buf.resize(GGML_MAX_NAME);
-    auto tn = [&tn_buf](const char * key) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key);
-        return tn_buf.data();
-    };
-    auto tni = [&tn_buf](const char * key, int bid) -> const char * {
-        snprintf(tn_buf.data(), tn_buf.size(), key, bid);
-        std::string s = tn_buf.data();
-        snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str());
-        return tn_buf.data();
-    };
-
-    GGUF_GET_KEY(fctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE);
-    GGML_ASSERT(arch == "llama");
-
-    uint32_t ftype_u;
-    GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_GENERAL_FILE_TYPE);
-    GGML_ASSERT((enum llama_ftype) ftype_u == LLAMA_FTYPE_ALL_F32);
-
-    // n_ctx was not saved in earlier checkpoint file versions, so we make it optional here
-    GGUF_GET_KEY(fctx, model->hparams.n_ctx,   gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH));
-
-    GGUF_GET_KEY(fctx, model->hparams.n_embd,  gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
-    GGUF_GET_KEY(fctx, model->hparams.n_ff,    gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
-    GGUF_GET_KEY(fctx, model->hparams.n_head,  gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
-    GGUF_GET_KEY(fctx, model->hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
-
-    model->hparams.n_rot = model->hparams.n_embd / model->hparams.n_head;
-    GGUF_GET_KEY(fctx, model->hparams.n_rot,   gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
-
-    float rope_freq_scale = 1.0f;
-    GGUF_GET_KEY(fctx, model->hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
-    GGUF_GET_KEY(fctx, model->hparams.rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
-    GGUF_GET_KEY(fctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
-    if (rope_freq_scale != 1.0f) {
-        model->hparams.rope_freq_scale = 1.0f / rope_freq_scale;
-    }
-
-    init_model(model);
-
-    copy_tensor_by_name(model->tok_embeddings, f_ggml_ctx, tn(LLM_TENSOR_TOKEN_EMBD));
-    copy_tensor_by_name(model->norm,           f_ggml_ctx, tn(LLM_TENSOR_OUTPUT_NORM));
-    copy_tensor_by_name(model->output,         f_ggml_ctx, tn(LLM_TENSOR_OUTPUT));
-
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        copy_tensor_by_name(layer.attention_norm, f_ggml_ctx, tni(LLM_TENSOR_ATTN_NORM, i));
-        copy_tensor_by_name(layer.wq,             f_ggml_ctx, tni(LLM_TENSOR_ATTN_Q, i));
-        copy_tensor_by_name(layer.wk,             f_ggml_ctx, tni(LLM_TENSOR_ATTN_K, i));
-        copy_tensor_by_name(layer.wv,             f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i));
-        copy_tensor_by_name(layer.wo,             f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i));
-        copy_tensor_by_name(layer.ffn_norm,       f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i));
-        copy_tensor_by_name(layer.ffn_gate,       f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i));
-        copy_tensor_by_name(layer.ffn_down,       f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i));
-        copy_tensor_by_name(layer.ffn_up,         f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i));
-    }
-}
-
-static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model) {
-    const char * arch = "llama";
-
-    enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
-
-    std::vector<char> keybuf;
-    keybuf.resize(512);
-    auto kv = [arch, &keybuf](const char * key) -> const char * {
-        snprintf(keybuf.data(), keybuf.size(), key, arch);
-        return keybuf.data();
-    };
-
-    // set arch
-    gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch);
-    gguf_set_val_str(fctx, LLM_KV_GENERAL_NAME, arch);
-    gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype);
-
-    // set hparams
-    gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH),              model->hparams.n_ctx                  );
-    gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH),            model->hparams.n_embd                 );
-    gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH),         model->hparams.n_ff                   );
-    gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT),        model->hparams.n_head                 );
-    gguf_set_val_u32(fctx, kv(LLM_KV_BLOCK_COUNT),                 model->hparams.n_layer                );
-    gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_DIMENSION_COUNT),        model->hparams.n_rot                  );
-
-    gguf_set_val_f32(fctx, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS), model->hparams.f_norm_rms_eps         );
-    gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_FREQ_BASE),              model->hparams.rope_freq_base         ); // TODO load in llama.cpp
-    gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_SCALE_LINEAR),           1.0f / model->hparams.rope_freq_scale );
-
-    // set vocab by copying from vocab_model gguf file
-    {
-        struct gguf_init_params params = {
-            /*.no_alloc = */ false,
-            /*.ctx      = */ NULL,
-        };
-        struct gguf_context * vctx = gguf_init_from_file(fn_vocab_model, params);
-
-        const int token_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_LIST));
-        if (token_idx == -1) {
-            die("cannot find tokenizer vocab in model file");
-        }
-        const uint32_t n_vocab = gguf_get_arr_n(vctx, token_idx);
-
-        const int score_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_SCORES));
-        if (score_idx == -1) {
-            die("cannot find tokenizer scores in model file");
-        }
-
-        const float * scores = (const float * ) gguf_get_arr_data(vctx, score_idx);
-
-        const int toktype_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE));
-        if (toktype_idx == -1) {
-            die("cannot find token type list in GGUF file");
-        }
-
-        const int * toktypes = (const int * ) gguf_get_arr_data(vctx, toktype_idx);
-
-        std::string tokenizer_name;
-        GGUF_GET_KEY(vctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_TOKENIZER_MODEL));
-
-        gguf_set_val_str(fctx, kv(LLM_KV_TOKENIZER_MODEL), tokenizer_name.c_str());
-        gguf_set_arr_data(fctx, kv(LLM_KV_TOKENIZER_SCORES), GGUF_TYPE_FLOAT32, scores, n_vocab);
-        gguf_set_arr_data(fctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE), GGUF_TYPE_INT32, toktypes, n_vocab);
-
-        int32_t special_bos_id = 1;
-        int32_t special_eos_id = 2;
-        int32_t special_unk_id = 0;
-        int32_t special_sep_id = -1;
-        int32_t special_pad_id = -1;
-        if (tokenizer_name == "llama") {
-            // default special tokens
-            special_bos_id = 1;
-            special_eos_id = 2;
-            special_unk_id = 0;
-            special_sep_id = -1;
-            special_pad_id = -1;
-        } else if (tokenizer_name == "gpt2") {
-            // read and copy bpe merges
-            const int merges_keyidx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_MERGES));
-            if (merges_keyidx == -1) {
-                die("cannot find tokenizer merges in model file");
-            }
-
-            const int n_merges = gguf_get_arr_n(vctx, merges_keyidx);
-
-            std::vector<const char*> merges;
-            merges.resize(n_merges);
-            for (int i = 0; i < n_merges; i++) {
-                merges[i] = gguf_get_arr_str(vctx, merges_keyidx, i);
-            }
-            gguf_set_arr_str(fctx, kv(LLM_KV_TOKENIZER_MERGES), merges.data(), n_merges);
-
-            // default special tokens
-            special_bos_id = 11;
-            special_eos_id = 11;
-            special_unk_id = -1;
-            special_sep_id = -1;
-            special_pad_id = -1;
-        } else {
-            fprintf(stderr, "%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
-            fprintf(stderr, "%s: using default tokenizer: 'llama'", __func__);
-        }
-
-        std::vector<const char*> tokens;
-        tokens.resize(n_vocab);
-        for (uint32_t i = 0; i < n_vocab; i++) {
-            tokens[i] = gguf_get_arr_str(vctx, token_idx, i);
-        }
-        gguf_set_arr_str(fctx, kv(LLM_KV_TOKENIZER_LIST), tokens.data(), n_vocab);
-
-        GGUF_GET_KEY(vctx, special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
-        GGUF_GET_KEY(vctx, special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_EOS_ID));
-        GGUF_GET_KEY(vctx, special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID));
-        GGUF_GET_KEY(vctx, special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID));
-        GGUF_GET_KEY(vctx, special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID));
-
-        gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_BOS_ID), special_bos_id);
-        gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_EOS_ID), special_eos_id);
-        gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_UNK_ID), special_unk_id);
-        gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_SEP_ID), special_sep_id);
-        gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_PAD_ID), special_pad_id);
-
-        gguf_free(vctx);
-    }
-
-    // add tensors
-    gguf_add_tensor(fctx, model->tok_embeddings);
-    gguf_add_tensor(fctx, model->norm);
-    gguf_add_tensor(fctx, model->output);
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-
-        gguf_add_tensor(fctx, layer.attention_norm);
-        gguf_add_tensor(fctx, layer.wq);
-        gguf_add_tensor(fctx, layer.wk);
-        gguf_add_tensor(fctx, layer.wv);
-        gguf_add_tensor(fctx, layer.wo);
-        gguf_add_tensor(fctx, layer.ffn_norm);
-        gguf_add_tensor(fctx, layer.ffn_gate);
-        gguf_add_tensor(fctx, layer.ffn_down);
-        gguf_add_tensor(fctx, layer.ffn_up);
-    }
-}
-
-static void save_llama_model_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model) {
-    printf("%s: saving to %s\n", __func__, filename);
-    struct gguf_context * fctx = gguf_init_empty();
-
-    save_llama_model_gguf(fctx, fn_vocab_model, model);
-
-    // write file
-    const bool only_meta = false;
-    gguf_write_to_file(fctx, filename, only_meta);
-    gguf_free(fctx);
-}
-
-static void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct train_state * train) {
-    load_llama_model_gguf(fctx, f_ggml_ctx, model);
-    if (load_train_state_gguf(fctx, f_ggml_ctx, train)) {
-        std::string train_type = LLM_KV_TRAINING_TYPE_TRAIN_MODEL;
-        GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE);
-        GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_TRAIN_MODEL);
-    } else {
-        printf("%s: loaded llama model as checkpoint\n", __func__);
-    }
-}
-
-static void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model, struct train_state * train) {
-    gguf_set_val_str(fctx, LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_TRAIN_MODEL);
-    save_llama_model_gguf(fctx, fn_vocab_model, model);
-    save_train_state_gguf(fctx, train);
-}
-
-static bool load_checkpoint_file(const char * filename, struct my_llama_model * model, struct train_state * train) {
-    struct ggml_context * f_ggml_ctx;
-    struct gguf_init_params params;
-    params.no_alloc = false;
-    params.ctx = &f_ggml_ctx;
-    struct gguf_context * fctx = gguf_init_from_file(filename, params);
-    if (fctx == NULL) {
-        return false;
-    }
-
-    load_checkpoint_gguf(fctx, f_ggml_ctx, model, train);
-
-    gguf_free(fctx);
-    return true;
-}
-
-static void save_checkpoint_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model, struct train_state * train) {
-    printf("%s: saving to %s\n", __func__, filename);
-    struct gguf_context * fctx = gguf_init_empty();
-
-    save_checkpoint_gguf(fctx, fn_vocab_model, model, train);
-
-    // write file
-    const bool only_meta = false;
-    gguf_write_to_file(fctx, filename, only_meta);
-    gguf_free(fctx);
-}
-
-struct train_params {
-    struct train_params_common common;
-
-    const char * fn_vocab_model;
-    const char * fn_model_out;
-
-    bool only_write_model;
-
-    int n_ctx;
-    int n_embd;
-    int n_head;
-    int n_layer;
-    int n_ff;
-
-    float f_norm_rms_eps;
-    float rope_freq_base;
-    float rope_freq_scale;
-};
-
-static struct train_params get_default_train_params() {
-    struct train_params params;
-    params.common = get_default_train_params_common();
-    params.fn_vocab_model    = "ggml-vic7b-uncensored-q4_0.bin";
-    params.fn_model_out      = "ggml-checkpoint-f32.bin";
-
-    params.only_write_model = false;
-
-    params.n_ctx      =  128;
-    params.n_embd     =  256;
-    params.n_head     =    8;
-    params.n_layer    =   16;
-    params.n_ff       =  768;
-
-    params.f_norm_rms_eps  = 1e-5f;
-    params.rope_freq_base  = 10000.0f;
-    params.rope_freq_scale = 1.0f;
-
-    return params;
-}
-
-static void train_print_usage(int argc, char ** argv, const struct train_params * params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help                 show this help message and exit\n");
-
-    fprintf(stderr, "  --vocab-model FNAME        model path from which to load vocab (default '%s')\n", params->fn_vocab_model);
-    fprintf(stderr, "  --model-out FNAME          path to save ggml model (default '%s')\n", params->fn_model_out);
-    fprintf(stderr, "  --only-write-model         only save llama model, don't do any training. use this if you only want to convert a checkpoint to a model.\n");
-    fprintf(stderr, "  --embd N                   Embedding size used for new models (default %d)\n", params->n_embd);
-    fprintf(stderr, "  --ff N                     Feedforward size used for new models. (default %d)\n", params->n_ff);
-    fprintf(stderr, "  --head N                   Number of heads for new models (default %d)\n", params->n_head);
-    fprintf(stderr, "  --layer N                  Number of layers for new models (default %d)\n", params->n_layer);
-    fprintf(stderr, "  --norm-rms-eps F           RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps);
-    fprintf(stderr, "  --rope-freq-base F         Frequency base for ROPE (default %f)\n", params->rope_freq_base);
-    fprintf(stderr, "  --rope-freq-scale F        Frequency scale for ROPE (default %f)\n", params->rope_freq_scale);
-
-    print_common_train_usage(argc, argv, &params->common);
-}
-
-static bool train_params_parse(int argc, char ** argv, struct train_params * params) {
-    bool invalid_param = false;
-    std::string arg;
-    struct train_params default_params = get_default_train_params();
-    const std::string arg_prefix = "--";
-
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
-            std::replace(arg.begin(), arg.end(), '_', '-');
-        }
-
-        if (consume_common_train_arg(argc, argv, &i, &params->common, &invalid_param)) {
-            if (invalid_param) {
-                break;
-            } else if (params->common.print_usage) {
-                train_print_usage(argc, argv, &default_params);
-                exit(0);
-            }
-        } else if (arg == "--vocab-model") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->fn_vocab_model = argv[i];
-        } else if (arg == "--model-out") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->fn_model_out = argv[i];
-        } else if (arg == "--only-write-model") {
-            params->only_write_model = true;
-        } else if (arg == "--embd") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_embd = std::stoi(argv[i]);
-        } else if (arg == "--ff") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_ff = std::stoi(argv[i]);
-        } else if (arg == "--head") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_head = std::stoi(argv[i]);
-        } else if (arg == "--layer") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_layer = std::stoi(argv[i]);
-        } else if (arg == "--norm-rms-eps") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->f_norm_rms_eps = std::stof(argv[i]);
-        } else if (arg == "--rope-freq-base") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->rope_freq_base = std::stof(argv[i]);
-        } else if (arg == "--rope-freq-scale") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->rope_freq_scale = std::stof(argv[i]);
-        } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            train_print_usage(argc, argv, &default_params);
-            exit(1);
-        }
-    }
-    if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        train_print_usage(argc, argv, &default_params);
-        exit(1);
-    }
-    finish_processing_train_args(&params->common);
-
-    return true;
-}
-
-struct save_train_files_data {
-    const char            * fn_checkpoint_out;
-    const char            * fn_model_out;
-    const char            * fn_vocab_model;
-    const char            * pattern_fn_it;
-    const char            * fn_latest;
-    struct my_llama_model * model;
-};
-
-static void save_train_files(void * vdata, struct train_state * train) {
-    struct save_train_files_data * data   = (struct save_train_files_data *) vdata;
-    int64_t iter = train->opt->iter;
-
-    if (strlen(data->fn_checkpoint_out) > 0) {
-        save_checkpoint_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->fn_vocab_model, data->model, train);
-        save_checkpoint_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, -1  ).c_str(), data->fn_vocab_model, data->model, train);
-
-    }
-    if (strlen(data->fn_model_out) > 0) {
-        save_llama_model_file(get_train_filename(data->fn_model_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->fn_vocab_model, data->model);
-        save_llama_model_file(get_train_filename(data->fn_model_out, data->pattern_fn_it, data->fn_latest, -1  ).c_str(), data->fn_vocab_model, data->model);
-    }
-}
-
-static int64_t get_parameter_count(struct my_llama_model* model) {
-    int64_t nx = 0;
-    nx += ggml_nelements(model->tok_embeddings);
-    nx += ggml_nelements(model->norm);
-    nx += ggml_nelements(model->output);
-
-    for (uint32_t i = 0; i < model->layers.size(); ++i) {
-        auto & layer = model->layers[i];
-        nx += ggml_nelements(layer.attention_norm);
-        nx += ggml_nelements(layer.wq);
-        nx += ggml_nelements(layer.wk);
-        nx += ggml_nelements(layer.wv);
-        nx += ggml_nelements(layer.wo);
-        nx += ggml_nelements(layer.ffn_norm);
-        nx += ggml_nelements(layer.ffn_gate);
-        nx += ggml_nelements(layer.ffn_down);
-        nx += ggml_nelements(layer.ffn_up);
-    }
-    return nx;
-}
-
-int main(int argc, char ** argv) {
-    struct train_params params = get_default_train_params();
-
-    if (!train_params_parse(argc, argv, &params)) {
-        return 1;
-    }
-
-    if (params.common.seed == LLAMA_DEFAULT_SEED) {
-        params.common.seed = time(NULL);
-    }
-    printf("%s: seed: %u\n", __func__, params.common.seed);
-    srand(params.common.seed);
-
-    struct llama_model_params mparams = llama_model_default_params();
-    mparams.vocab_only = true;
-
-    struct llama_context_params cparams = llama_context_default_params();
-
-    struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, mparams);
-    struct llama_context * lctx = llama_new_context_with_model(lmodel, cparams);
-
-    struct my_llama_model model;
-    model.hparams.n_vocab = llama_n_vocab(lmodel);
-    model.hparams.n_ctx   = params.common.n_ctx;
-    model.hparams.n_embd  = params.n_embd;
-    model.hparams.n_head  = params.n_head;
-    model.hparams.n_layer = params.n_layer;
-    model.hparams.n_ff    = params.n_ff;
-    // llama.cpp requires n_rot to be exactly n_embd / n_head
-    model.hparams.n_rot   = model.hparams.n_embd / model.hparams.n_head;
-    model.hparams.f_norm_rms_eps  = params.f_norm_rms_eps;
-    model.hparams.rope_freq_base  = params.rope_freq_base;
-    model.hparams.rope_freq_scale = params.rope_freq_scale;
-
-    struct train_state      * train = init_train_state();
-    struct ggml_opt_context * opt   = train->opt;
-
-    // set opt params from command line
-    opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
-    opt->params.print_forward_graph     = false;
-    opt->params.print_backward_graph    = false;
-    opt->params.graph_size              = LLAMA_TRAIN_MAX_NODES;
-    opt->params.n_threads               = params.common.n_threads;
-    opt->params.past                    = params.common.opt_past;
-    opt->params.delta                   = params.common.opt_delta;
-    opt->params.max_no_improvement      = params.common.opt_max_no_improvement;
-    opt->params.n_gradient_accumulation = params.common.n_gradient_accumulation;
-    opt->params.adam.n_iter             = params.common.adam_n_iter;
-    opt->params.adam.sched              = 1.0f;
-    opt->params.adam.alpha              = params.common.adam_alpha;
-    opt->params.adam.decay              = params.common.adam_decay;
-    opt->params.adam.decay_min_ndim     = params.common.adam_decay_min_ndim;
-    opt->params.adam.beta1              = params.common.adam_beta1;
-    opt->params.adam.beta2              = params.common.adam_beta2;
-    opt->params.adam.gclip              = params.common.adam_gclip;
-    opt->params.adam.eps_f              = params.common.adam_eps_f;
-
-    printf("%s: init model\n", __func__);
-    bool existed = load_checkpoint_file(params.common.fn_checkpoint_in, &model, train);
-    if (existed) {
-        // overwrite last n_ctx with user provided n_ctx
-        if (params.common.custom_n_ctx) {
-            model.hparams.n_ctx = params.common.n_ctx;
-        }
-
-        const bool opt_past_changed = opt->params.past != params.common.opt_past;
-
-        if (opt_past_changed) {
-            die("Optimizer parameter '--opt-past N' differs from checkpoint file. To use different value train from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting");
-            // need to discard previous optimizer past function value statistics and opt_init with new shapes
-            // TODO
-        }
-    } else {
-        init_model(&model);
-        randomize_model(&model, params.common.seed, 0.0f, 1.0f, -1.0f, +1.0f);
-        if (!params.only_write_model) {
-            ggml_opt_init(opt->ctx, opt, opt->params, get_parameter_count(&model));
-        }
-    }
-    opt->iter = train->train_its;
-
-    print_params(&model.hparams);
-    printf("%s: total train_iterations %llu\n", __func__, (long long unsigned) train->train_its);
-    printf("%s: seen train_samples     %llu\n", __func__, (long long unsigned) train->train_samples);
-    printf("%s: seen train_tokens      %llu\n", __func__, (long long unsigned) train->train_tokens);
-    printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
-    printf("%s: model_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(model.ctx) + ggml_backend_buffer_get_size(model.data)), (float) (ggml_used_mem(model.ctx) + ggml_backend_buffer_get_size(model.data)) / (1024.0f*1024.0f));
-
-    if (params.only_write_model) {
-        save_train_files_data save_data;
-        save_data.fn_checkpoint_out = "";
-        save_data.fn_model_out      = params.fn_model_out;
-        save_data.fn_vocab_model    = params.fn_vocab_model;
-        save_data.pattern_fn_it     = params.common.pattern_fn_it;
-        save_data.fn_latest         = params.common.fn_latest;
-        save_data.model             = &model;
-
-        save_train_files(&save_data, train);
-
-        free_train_state(train);
-        ggml_free(model.ctx);
-        llama_free(lctx);
-        llama_free_model(lmodel);
-        return 0;
-    }
-
-    printf("%s: opt_size  = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f));
-    printf("%s: opt iter %d\n", __func__, opt->iter);
-
-    int n_tokens = model.hparams.n_ctx;
-    int n_vocab  = model.hparams.n_vocab;
-    int n_batch  = params.common.n_batch;
-
-    // context for input tensors without their data
-    struct ggml_init_params ctx_input_params = {
-        ggml_tensor_overhead() * 2, // mem_size
-        NULL,                       // mem_buffer
-        true,                       // no_alloc
-    };
-    struct ggml_context * ctx_input = ggml_init(ctx_input_params);
-
-    // the input tensors
-    struct ggml_tensor * tokens_input  = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch);
-    struct ggml_tensor * target_probs  = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab,  n_tokens, n_batch);
-
-    // measure required memory for input tensors
-    // allocate input tensors
-    ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type());
-    size_t max_input_size = ggml_backend_buffer_get_size(input_data);
-    printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
-
-    // context for compute tensors without their data
-    const size_t estimated_compute_size_wo_data = (
-            2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() +
-            (params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true))
-    );
-    struct ggml_init_params ctx_compute_params = {
-        estimated_compute_size_wo_data, // mem_size
-        NULL,                           // mem_buffer
-        true,                           // no_alloc
-    };
-    struct ggml_context * ctx_compute = NULL;
-
-    struct ggml_tensor * loss   = NULL;
-    struct ggml_tensor * logits = NULL;
-
-    struct ggml_cgraph * gf     = NULL;
-    struct ggml_cgraph * gb     = NULL;
-    struct ggml_cgraph * gb_tmp = NULL;
-
-    // measure required memory for compute tensors
-    size_t best_compute_size = SIZE_MAX;
-    enum ggml_cgraph_eval_order best_order = GGML_CGRAPH_EVAL_ORDER_COUNT;
-    // find best evaluation order
-    for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
-        ctx_compute = ggml_init(ctx_compute_params);
-        ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
-        gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-        gf->order = (enum ggml_cgraph_eval_order) order;
-        gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-        gb_tmp = params.common.use_checkpointing
-            ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
-            : NULL;
-        loss = llama_build_train_graphs(
-            &model, alloc, ctx_compute,
-            gf, gb, gb_tmp,
-            &logits, tokens_input, target_probs,
-            n_tokens, n_batch,
-            params.common.use_flash,
-            params.common.use_checkpointing,
-            true
-        );
-        size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer
-        if (max_compute_size < best_compute_size) {
-            best_compute_size = max_compute_size;
-            best_order = gf->order;
-        }
-        ggml_free(ctx_compute);
-    }
-    size_t max_compute_size = best_compute_size;
-    printf("%s: compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f));
-    printf("%s: evaluation order = %s\n", __func__,
-        (best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" :
-        (best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" :
-        "invalid");
-
-    // allocate compute tensors
-    ctx_compute = ggml_init(ctx_compute_params);
-    ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
-    gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-    gf->order = best_order;
-    gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
-    gb_tmp = params.common.use_checkpointing
-        ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
-        : NULL;
-    loss = llama_build_train_graphs(
-        &model, alloc, ctx_compute,
-        gf, gb, gb_tmp,
-        &logits, tokens_input, target_probs,
-        n_tokens, n_batch,
-        params.common.use_flash,
-        params.common.use_checkpointing,
-        false
-    );
-
-    std::vector<llama_token> train_tokens;
-    std::vector<size_t> train_samples_begin;
-    std::vector<size_t> train_samples_size;
-    printf("%s: tokenize training data\n", __func__);
-    tokenize_file(lctx,
-            params.common.fn_train_data,
-            params.common.sample_start,
-            params.common.include_sample_start,
-            params.common.overlapping_samples,
-            n_tokens,
-            train_tokens,
-            train_samples_begin,
-            train_samples_size);
-    GGML_ASSERT(train_samples_begin.size() == train_samples_size.size());
-
-    printf("%s: number of training tokens: %zu\n", __func__, train_tokens.size());
-
-    size_t shuffle_samples_hash = compute_samples_hash(params.common.fn_train_data, train_samples_begin.data(), train_samples_size.data(), train_samples_size.size());
-    const bool changed_train_data = (shuffle_samples_hash != train->shuffle_samples_hash) || (train->shuffle_sample_count != train_samples_size.size());
-    if (changed_train_data) {
-        printf("%s: train data seems to have changed. restarting shuffled epoch.\n", __func__);
-    }
-    if (params.common.force_reshuffle) {
-        printf("%s: forced reshuffling of data. restarting with newly shuffled epoch.\n", __func__);
-    }
-    if ((train->shuffle_rng_state_current == "") || changed_train_data || params.common.force_reshuffle) {
-        train->shuffle_rng_state_current = mt19937_seed_to_state(params.common.seed);
-        train->shuffle_sample_count = train_samples_size.size();
-        train->shuffle_next_sample = 0;
-        train->shuffle_samples_hash = shuffle_samples_hash;
-    }
-    std::vector<size_t> train_shuffled_samples_offs;
-    std::vector<size_t> train_shuffled_samples_begin;
-    std::vector<size_t> train_shuffled_samples_size;
-    train_shuffled_samples_offs.resize(train_samples_begin.size());
-    train_shuffled_samples_begin.resize(train_samples_begin.size());
-    train_shuffled_samples_size.resize(train_samples_size.size());
-    train->shuffle_rng_state_next = shuffle_samples(
-        train->shuffle_rng_state_current,
-        train_shuffled_samples_offs.data(),
-        train_shuffled_samples_begin.data(),
-        train_shuffled_samples_size.data(),
-        train_samples_begin.data(),
-        train_samples_size.data(),
-        train_samples_size.size());
-    printf("%s: begin training\n", __func__);
-
-    save_train_files_data save_data;
-    save_data.fn_checkpoint_out = params.common.fn_checkpoint_out;
-    save_data.fn_model_out      = params.fn_model_out;
-    save_data.fn_vocab_model    = params.fn_vocab_model;
-    save_data.pattern_fn_it     = params.common.pattern_fn_it;
-    save_data.fn_latest         = params.common.fn_latest;
-    save_data.model             = &model;
-
-    struct train_opt_callback_data opt_cb_data;
-    opt_cb_data.params                 = &params.common;
-    opt_cb_data.train                  = train;
-    opt_cb_data.save_cb                = &save_train_files;
-    opt_cb_data.save_data              = &save_data;
-    opt_cb_data.lctx                   = lctx;
-    opt_cb_data.last_save_iter         = opt->iter;
-    opt_cb_data.tokens_data            = train_tokens.data();
-    opt_cb_data.tokens_size            = train_tokens.size();
-    opt_cb_data.samples_begin          = train_samples_begin.data();
-    opt_cb_data.samples_size           = train_samples_size.data();
-    opt_cb_data.shuffled_samples_offs  = train_shuffled_samples_offs.data();
-    opt_cb_data.shuffled_samples_begin = train_shuffled_samples_begin.data();
-    opt_cb_data.shuffled_samples_size  = train_shuffled_samples_size.data();
-    opt_cb_data.samples_count          = train_samples_size.size();
-    opt_cb_data.tokens_input           = tokens_input;
-    opt_cb_data.target_probs           = target_probs;
-    opt_cb_data.first_iter             = opt->iter;
-    opt_cb_data.first_epoch            = train->train_epochs;
-    opt_cb_data.iter_at_last_epoch     = -1;
-    opt_cb_data.last_time              = ggml_time_ms();
-    opt_cb_data.millis_per_iter        = 0.0;
-
-    // measure required memory for work buffer
-    size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE;
-    printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f));
-
-    // context for work buffer
-    struct ggml_init_params ctx_work_params = {
-        max_work_size, // mem_size
-        NULL,          // mem_buffer
-        false,         // no_alloc
-    };
-    struct ggml_context * ctx_work = ggml_init(ctx_work_params);
-
-    int64_t t0 = ggml_time_ms();
-
-    ggml_opt_resume_g(ctx_work, opt, loss, gf, gb, &train_opt_callback, (void *) &opt_cb_data);
-
-    ggml_free(ctx_work);
-    ggml_free(ctx_compute);
-    ggml_free(ctx_input);
-
-    int64_t t1 = ggml_time_ms();
-    printf("%s: total training time: ", __func__);
-    print_duration((double) (t1 - t0));
-    printf("\n");
-
-    int new_iters = opt->iter - opt_cb_data.last_save_iter;
-    if (new_iters > 0) {
-        train->train_its     += new_iters;
-        train->train_tokens  += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens;
-
-        save_train_files(&save_data, train);
-        opt_cb_data.last_save_iter = opt->iter;
-    }
-
-    ggml_free(opt->ctx);
-    free_train_state(train);
-    ggml_free(model.ctx);
-    llama_free(lctx);
-    llama_free_model(lmodel);
-    return 0;
-}
diff --git a/flake.lock b/flake.lock
index 3e5125dde..940cda6a4 100644
--- a/flake.lock
+++ b/flake.lock
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1720768451,
-        "narHash": "sha256-EYekUHJE2gxeo2pM/zM9Wlqw1Uw2XTJXOSAO79ksc4Y=",
+        "lastModified": 1721379653,
+        "narHash": "sha256-8MUgifkJ7lkZs3u99UDZMB4kbOxvMEXQZ31FO3SopZ0=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "7e7c39ea35c5cdd002cd4588b03a3fb9ece6fad9",
+        "rev": "1d9c2c9b3e71b9ee663d11c5d298727dace8d374",
         "type": "github"
       },
       "original": {
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 2fdb9fa40..548661b9b 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2400,6 +2400,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_vsx        (void);
     GGML_API int ggml_cpu_has_matmul_int8(void);
     GGML_API int ggml_cpu_has_cann       (void);
+    GGML_API int ggml_cpu_has_llamafile  (void);
 
     //
     // Internal types and functions exposed for tests and benchmarks
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 3f4c66bf7..c6496c921 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -467,15 +467,18 @@ if (GGML_SYCL)
         message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA")
     endif()
 
-    if ( NOT DEFINED ENV{ONEAPI_ROOT})
-        message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
+    check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)
+    if ( DEFINED ENV{ONEAPI_ROOT})
+        message(STATUS "Using oneAPI Release SYCL compiler (icpx).")
+    elseif(SUPPORTS_SYCL)
+        message(WARNING "Using open-source SYCL compiler (clang++). Didn't detect ENV {ONEAPI_ROOT}.
+         If you expected the oneAPI Release compiler, please install oneAPI & source it, like:
+         source /opt/intel/oneapi/setvars.sh")
+    else()
+        message(FATAL_ERROR, "C++ compiler lacks SYCL support.")
     endif()
-    #todo: AOT
-
-    find_package(IntelSYCL REQUIRED)
-    find_package(MKL REQUIRED)
-
     message(STATUS "SYCL found")
+    #todo: AOT
 
     list(APPEND GGML_CDEF_PUBLIC GGML_USE_SYCL)
 
@@ -487,11 +490,9 @@ if (GGML_SYCL)
         add_compile_definitions(GGML_SYCL_FORCE_MMQ)
     endif()
 
-    add_compile_options(-I./) #include DPCT
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
 
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
     if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
         add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
     else()
         add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
@@ -504,14 +505,14 @@ if (GGML_SYCL)
     list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")
 
     if (WIN32)
+        find_package(IntelSYCL REQUIRED)
+        find_package(MKL REQUIRED)
         set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
     else()
-        add_compile_options(-I/${SYCL_INCLUDE_DIR})
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
-
         if (GGML_SYCL_TARGET STREQUAL "INTEL")
             set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
         elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
             set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
         endif()
     endif()
diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c
index 26535b1c4..af53dea17 100644
--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
@@ -392,7 +392,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
 #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
     GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
                 "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
-#elif defined(__ARM_NEON) && defined(__aarch64__)
+#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -501,7 +501,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -613,7 +613,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     if (svcntw() == 8) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
@@ -753,7 +753,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
 #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
     GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
                 "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
-#elif defined(__ARM_NEON) && defined(__aarch64__)
+#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -1271,7 +1271,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     const void * b_ptr = vx;
     const void * a_ptr = vy;
     float * res_ptr = s;
@@ -1727,7 +1727,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
     if (svcntw() == 8) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 26d9412a2..1c2e00c1e 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -459,7 +459,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
 
 static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
     c = __builtin_amdgcn_sdot4(a, b, c, false);
 #elif defined(RDNA3)
     c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 3af2eec2f..84f6387e2 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -59,6 +59,24 @@ void ggml_cuda_op_mul_mat_q(
         case GGML_TYPE_Q6_K:
             mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
             break;
+        case GGML_TYPE_IQ2_XXS:
+            mul_mat_q_case<GGML_TYPE_IQ2_XXS>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ2_XS:
+            mul_mat_q_case<GGML_TYPE_IQ2_XS>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ2_S:
+            mul_mat_q_case<GGML_TYPE_IQ2_S>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ3_XXS:
+            mul_mat_q_case<GGML_TYPE_IQ3_XXS>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ3_S:
+            mul_mat_q_case<GGML_TYPE_IQ3_S>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ1_S:
+            mul_mat_q_case<GGML_TYPE_IQ1_S>(ctx, args, stream);
+            break;
         case GGML_TYPE_IQ4_XS:
             mul_mat_q_case<GGML_TYPE_IQ4_XS>(ctx, args, stream);
             break;
@@ -93,6 +111,12 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ1_S:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_NL:
             mmq_supported = true;
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index 51c44d857..f08a4758d 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -63,6 +63,14 @@ static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) {
         case GGML_TYPE_Q5_K:
             return MMQ_Q8_1_DS_LAYOUT_DS4;
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ3_S:
+            return MMQ_Q8_1_DS_LAYOUT_D4;
+        case GGML_TYPE_IQ1_S:
+            return MMQ_Q8_1_DS_LAYOUT_DS4;
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_NL:
             return MMQ_Q8_1_DS_LAYOUT_D4;
@@ -131,15 +139,16 @@ static constexpr __device__ int get_mmq_y_device() {
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 }
 
-#define MMQ_DP4A_TXS_Q4_0 tile_x_sizes{mmq_y*WARP_SIZE   + mmq_y, mmq_y*WARP_SIZE/QI4_0   + mmq_y/QI4_0,     0}
-#define MMQ_DP4A_TXS_Q4_1 tile_x_sizes{mmq_y*WARP_SIZE   + mmq_y, mmq_y*WARP_SIZE/QI4_1   + mmq_y/QI4_1,     0}
-#define MMQ_DP4A_TXS_Q8_0 tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE*2/QI8_0 + mmq_y/(QI8_0/2), 0}
-#define MMQ_DP4A_TXS_Q8_1 tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE*2/QI8_1 + mmq_y/(QI8_1/2), 0}
-#define MMQ_DP4A_TXS_Q2_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE         + mmq_y,           0}
-#define MMQ_DP4A_TXS_Q3_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y,                                     mmq_y*WARP_SIZE/8 + mmq_y/8}
-#define MMQ_DP4A_TXS_Q4_K tile_x_sizes{mmq_y*WARP_SIZE   + mmq_y, mmq_y*WARP_SIZE/QI4_K   + mmq_y/QI4_K,     mmq_y*WARP_SIZE/8 + mmq_y/8}
-#define MMQ_DP4A_TXS_Q5_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI5_K   + mmq_y/QI5_K,     mmq_y*WARP_SIZE/8 + mmq_y/8}
-#define MMQ_DP4A_TXS_Q6_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI6_K   + mmq_y/QI6_K,     mmq_y*WARP_SIZE/8 + mmq_y/8}
+#define MMQ_DP4A_TXS_Q4_0    tile_x_sizes{mmq_y*WARP_SIZE   + mmq_y, mmq_y*WARP_SIZE/QI4_0   + mmq_y/QI4_0,     0}
+#define MMQ_DP4A_TXS_Q4_1    tile_x_sizes{mmq_y*WARP_SIZE   + mmq_y, mmq_y*WARP_SIZE/QI4_1   + mmq_y/QI4_1,     0}
+#define MMQ_DP4A_TXS_Q8_0    tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE*2/QI8_0 + mmq_y/(QI8_0/2), 0}
+#define MMQ_DP4A_TXS_Q8_0_16 tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE*4/QI8_0 + mmq_y/(QI8_0/4), 0}
+#define MMQ_DP4A_TXS_Q8_1    tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE*2/QI8_1 + mmq_y/(QI8_1/2), 0}
+#define MMQ_DP4A_TXS_Q2_K    tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE         + mmq_y,           0}
+#define MMQ_DP4A_TXS_Q3_K    tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y,                                     mmq_y*WARP_SIZE/8 + mmq_y/8}
+#define MMQ_DP4A_TXS_Q4_K    tile_x_sizes{mmq_y*WARP_SIZE   + mmq_y, mmq_y*WARP_SIZE/QI4_K,                     mmq_y*WARP_SIZE/8 + mmq_y/8}
+#define MMQ_DP4A_TXS_Q5_K    tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI5_K   + mmq_y/QI5_K,     mmq_y*WARP_SIZE/8 + mmq_y/8}
+#define MMQ_DP4A_TXS_Q6_K    tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI6_K   + mmq_y/QI6_K,     mmq_y*WARP_SIZE/8 + mmq_y/8}
 
 static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml_type type, int mmq_y) {
     return type == GGML_TYPE_Q4_0 ? MMQ_DP4A_TXS_Q4_0 :
@@ -152,42 +161,46 @@ static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml
         type == GGML_TYPE_Q4_K    ? MMQ_DP4A_TXS_Q4_K :
         type == GGML_TYPE_Q5_K    ? MMQ_DP4A_TXS_Q5_K :
         type == GGML_TYPE_Q6_K    ? MMQ_DP4A_TXS_Q6_K :
+        type == GGML_TYPE_IQ2_XXS ? MMQ_DP4A_TXS_Q8_0 :
+        type == GGML_TYPE_IQ2_XS  ? MMQ_DP4A_TXS_Q8_0_16 :
+        type == GGML_TYPE_IQ2_S   ? MMQ_DP4A_TXS_Q8_0_16 :
+        type == GGML_TYPE_IQ3_XXS ? MMQ_DP4A_TXS_Q8_0 :
+        type == GGML_TYPE_IQ3_S   ? MMQ_DP4A_TXS_Q8_0 :
+        type == GGML_TYPE_IQ1_S   ? MMQ_DP4A_TXS_Q8_0 :
         type == GGML_TYPE_IQ4_XS  ? MMQ_DP4A_TXS_Q8_0 :
         type == GGML_TYPE_IQ4_NL  ? MMQ_DP4A_TXS_Q8_0 :
         tile_x_sizes{0, 0, 0};
 }
 
-#define MMQ_MMA_TILE_X_K_Q4_0 (1*WARP_SIZE + WARP_SIZE/QI4_0                   + 4)
-#define MMQ_MMA_TILE_X_K_Q4_1 (1*WARP_SIZE + WARP_SIZE/QI4_1                   + 4)
 #define MMQ_MMA_TILE_X_K_Q8_0 (2*WARP_SIZE + 2*WARP_SIZE/QI8_0                 + 4)
 #define MMQ_MMA_TILE_X_K_Q8_1 (2*WARP_SIZE + 2*WARP_SIZE/QI8_0                 + 4)
 #define MMQ_MMA_TILE_X_K_Q2_K (2*WARP_SIZE + WARP_SIZE                         + 4)
-#define MMQ_MMA_TILE_X_K_Q3_K (2*WARP_SIZE + WARP_SIZE/(2*QI3_K) + WARP_SIZE/8 + 7)
-#define MMQ_MMA_TILE_X_K_Q4_K (1*WARP_SIZE + WARP_SIZE/QI4_K     + WARP_SIZE/8 + 7)
-#define MMQ_MMA_TILE_X_K_Q5_K (2*WARP_SIZE + WARP_SIZE/QI5_K     + WARP_SIZE/8 + 7)
+#define MMQ_MMA_TILE_X_K_Q3_K (2*WARP_SIZE + WARP_SIZE/2                       + 4)
 #define MMQ_MMA_TILE_X_K_Q6_K (2*WARP_SIZE + WARP_SIZE/QI6_K     + WARP_SIZE/8 + 7)
 
-static_assert(MMQ_MMA_TILE_X_K_Q4_0 % 8 == 4, "Wrong padding.");
-static_assert(MMQ_MMA_TILE_X_K_Q4_1 % 8 == 4, "Wrong padding.");
 static_assert(MMQ_MMA_TILE_X_K_Q8_0 % 8 == 4, "Wrong padding.");
 static_assert(MMQ_MMA_TILE_X_K_Q8_1 % 8 == 4, "Wrong padding.");
 static_assert(MMQ_MMA_TILE_X_K_Q2_K % 8 == 4, "Wrong padding.");
 static_assert(MMQ_MMA_TILE_X_K_Q3_K % 8 == 4, "Wrong padding.");
-static_assert(MMQ_MMA_TILE_X_K_Q4_K % 8 == 4, "Wrong padding.");
-static_assert(MMQ_MMA_TILE_X_K_Q5_K % 8 == 4, "Wrong padding.");
 static_assert(MMQ_MMA_TILE_X_K_Q6_K % 8 == 4, "Wrong padding.");
 
 static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
-    return type == GGML_TYPE_Q4_0 ? MMQ_MMA_TILE_X_K_Q4_0 :
-        type == GGML_TYPE_Q4_1    ? MMQ_MMA_TILE_X_K_Q4_1 :
+    return type == GGML_TYPE_Q4_0 ? MMQ_MMA_TILE_X_K_Q8_0 :
+        type == GGML_TYPE_Q4_1    ? MMQ_MMA_TILE_X_K_Q8_1 :
         type == GGML_TYPE_Q5_0    ? MMQ_MMA_TILE_X_K_Q8_0 :
         type == GGML_TYPE_Q5_1    ? MMQ_MMA_TILE_X_K_Q8_1 :
         type == GGML_TYPE_Q8_0    ? MMQ_MMA_TILE_X_K_Q8_0 :
         type == GGML_TYPE_Q2_K    ? MMQ_MMA_TILE_X_K_Q2_K :
         type == GGML_TYPE_Q3_K    ? MMQ_MMA_TILE_X_K_Q3_K :
-        type == GGML_TYPE_Q4_K    ? MMQ_MMA_TILE_X_K_Q4_K :
-        type == GGML_TYPE_Q5_K    ? MMQ_MMA_TILE_X_K_Q5_K :
+        type == GGML_TYPE_Q4_K    ? MMQ_MMA_TILE_X_K_Q8_1 :
+        type == GGML_TYPE_Q5_K    ? MMQ_MMA_TILE_X_K_Q8_1 :
         type == GGML_TYPE_Q6_K    ? MMQ_MMA_TILE_X_K_Q6_K :
+        type == GGML_TYPE_IQ2_XXS ? MMQ_MMA_TILE_X_K_Q8_0 :
+        type == GGML_TYPE_IQ2_XS  ? MMQ_MMA_TILE_X_K_Q3_K :
+        type == GGML_TYPE_IQ2_S   ? MMQ_MMA_TILE_X_K_Q3_K :
+        type == GGML_TYPE_IQ3_XXS ? MMQ_MMA_TILE_X_K_Q8_0 :
+        type == GGML_TYPE_IQ3_S   ? MMQ_MMA_TILE_X_K_Q8_0 :
+        type == GGML_TYPE_IQ1_S   ? MMQ_MMA_TILE_X_K_Q8_0 :
         type == GGML_TYPE_IQ4_XS  ? MMQ_MMA_TILE_X_K_Q8_0 :
         type == GGML_TYPE_IQ4_NL  ? MMQ_MMA_TILE_X_K_Q8_0 :
         0;
@@ -216,7 +229,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
 #ifdef INT8_MMA_AVAILABLE
     int   * x_qs = (int   *)  x_tile;
-    float * x_df = (float *) (x_qs + WARP_SIZE);
+    float * x_df = (float *) (x_qs + 2*WARP_SIZE);
 #else
     constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_0, mmq_y);
     int   * x_qs = (int   *)  x_tile;
@@ -235,11 +248,13 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
         }
 
         const block_q4_0 * bxi = (const block_q4_0 *) x + kbx0 + i*stride + kbx;
+        const int qs0 = get_int_b2(bxi->qs, kqsx);
 
 #ifdef INT8_MMA_AVAILABLE
-        x_qs[i*MMQ_MMA_TILE_X_K_Q4_0 + threadIdx.x] = get_int_b2(bxi->qs, kqsx);
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kbx*(2*QI4_0) + kqsx + 0]     = __vsubss4((qs0 >> 0) & 0x0F0F0F0F, 0x08080808);
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + kbx*(2*QI4_0) + kqsx + QI4_0] = __vsubss4((qs0 >> 4) & 0x0F0F0F0F, 0x08080808);
 #else
-        x_qs[i*(WARP_SIZE + 1)       + threadIdx.x] = get_int_b2(bxi->qs, kqsx);
+        x_qs[i*(WARP_SIZE + 1) + threadIdx.x] = qs0;
 #endif // INT8_MMA_AVAILABLE
     }
 
@@ -257,7 +272,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
         const block_q4_0 * bxi = (const block_q4_0 *) x + kbx0 + i*stride + kbxd;
 
 #ifdef INT8_MMA_AVAILABLE
-        x_df[i*MMQ_MMA_TILE_X_K_Q4_0       + kbxd] = bxi->d;
+        x_df[i*MMQ_MMA_TILE_X_K_Q8_0       + kbxd] = bxi->d;
 #else
         x_df[i*(WARP_SIZE/QI4_0) + i/QI4_0 + kbxd] = bxi->d;
 #endif // INT8_MMA_AVAILABLE
@@ -304,95 +319,12 @@ static __device__ __forceinline__ void vec_dot_q4_0_q8_1_dp4a(
     }
 }
 
-template <int mmq_x, int mmq_y, int nwarps>
-static __device__ __forceinline__ void vec_dot_q4_0_q8_1_mma(
-    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
-#ifdef INT8_MMA_AVAILABLE
-
-    typedef mma_int_A_I16K8 mma_A;
-    typedef mma_int_B_J8K8  mma_B;
-    typedef mma_int_C_I16J8 mma_C;
-
-    constexpr int granularity = mmq_get_granularity_device(mmq_x);
-    constexpr int rows_per_warp = 2 * granularity;
-    constexpr int ntx = rows_per_warp/mma_C::I; // Number of x minitiles per warp.
-
-    y += (threadIdx.y % ntx) * (mma_B::J*MMQ_TILE_Y_K);
-
-    const int   * x_qs = (const int   *) x;
-    const float * x_df = (const float *) x_qs + WARP_SIZE;
-    const int   * y_qs = (const int   *) y + 4;
-    const half2 * y_ds = (const half2 *) y;
-
-    mma_A A[ntx][4];
-    float dA[ntx][mma_C::ne/2][4];
-
-    const int i0 = (threadIdx.y / ntx) * (ntx*mma_A::I);
-
-#pragma unroll
-    for (int n = 0; n < ntx; ++n) {
-#pragma unroll
-        for (int k01 = 0; k01 < WARP_SIZE; k01 += QR4_0*QI4_0) {
-            const int k0 = k00 + k01;
-
-#pragma unroll
-            for (int l = 0; l < mma_A::ne; ++l) {
-                const int i     = i0 + n*mma_A::I + mma_A::get_i(l);
-                const int k     = k0/QR4_0        + mma_A::get_k(l) % QI4_0;
-                const int shift =                4*(mma_A::get_k(l) / QI4_0);
-
-                A[n][k01/(QR4_0*QI4_0)].x[l] = __vsubss4((x_qs[i*MMQ_MMA_TILE_X_K_Q4_0 + k] >> shift) & 0x0F0F0F0F, 0x08080808);
-            }
-
-#pragma unroll
-            for (int l = 0; l < mma_C::ne/2; ++l) {
-                const int i = i0 + n*mma_C::I + mma_C::get_i(2*l);
-
-                dA[n][l][k01/(QR4_0*QI4_0)] = x_df[i*MMQ_MMA_TILE_X_K_Q4_0 + k0/(QR4_0*QI4_0)];
-            }
-        }
-    }
-
-#pragma unroll
-    for (int j0 = 0; j0 < mmq_x; j0 += ntx*mma_C::J) {
-#pragma unroll
-        for (int k01 = 0; k01 < WARP_SIZE; k01 += QR4_0*QI4_0) {
-            mma_B  B;
-            float dB[mma_C::ne/2];
-
-            B.load(y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
-
-#pragma unroll
-            for (int l = 0; l < mma_C::ne/2; ++l) {
-                const int j = j0 + mma_C::get_j(l);
-
-                dB[l] = __low2float(y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]);
-            }
-
-#pragma unroll
-            for (int n = 0; n < ntx; ++n) {
-                mma_C C;
-                C.mma_K8(A[n][k01/(QR4_0*QI4_0)], B);
-
-#pragma unroll
-                for (int l = 0; l < mma_C::ne; ++l) {
-                    sum[(j0/mma_C::J + n)*mma_C::ne + l] += dA[n][l/2][k01/(QR4_0*QI4_0)]*dB[l%2]*C.x[l];
-                }
-            }
-        }
-    }
-#else
-    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum);
-    NO_DEVICE_CODE;
-#endif // INT8_MMA_AVAILABLE
-}
-
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
     const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
 
 #ifdef INT8_MMA_AVAILABLE
     int   * x_qs = (int   *)  x_tile;
-    half2 * x_dm = (half2 *) (x_qs + WARP_SIZE);
+    half2 * x_dm = (half2 *) (x_qs + 2*WARP_SIZE);
 #else
     constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_1, mmq_y);
     int   * x_qs = (int   *)  x_tile;
@@ -411,11 +343,13 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
         }
 
         const block_q4_1 * bxi = (const block_q4_1 *) x + kbx0 + i*stride + kbx;
+        const int qs0 = get_int_b4(bxi->qs, kqsx);
 
 #ifdef INT8_MMA_AVAILABLE
-        x_qs[i*MMQ_MMA_TILE_X_K_Q4_1 + threadIdx.x] = get_int_b4(bxi->qs, kqsx);
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + kbx*(2*QI4_1) + kqsx + 0]     = (qs0 >> 0) & 0x0F0F0F0F;
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + kbx*(2*QI4_1) + kqsx + QI4_1] = (qs0 >> 4) & 0x0F0F0F0F;
 #else
-        x_qs[i*(WARP_SIZE + 1)       + threadIdx.x] = get_int_b4(bxi->qs, kqsx);
+        x_qs[i*(WARP_SIZE + 1) + threadIdx.x] = qs0;
 #endif // INT8_MMA_AVAILABLE
     }
 
@@ -433,7 +367,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
         const block_q4_1 * bxi = (const block_q4_1 *) x + kbx0 + i*stride + kbxd;
 
 #ifdef INT8_MMA_AVAILABLE
-        x_dm[i*MMQ_MMA_TILE_X_K_Q4_1       + kbxd] = bxi->dm;
+        x_dm[i*MMQ_MMA_TILE_X_K_Q8_1       + kbxd] = bxi->dm;
 #else
         x_dm[i*(WARP_SIZE/QI4_1) + i/QI4_1 + kbxd] = bxi->dm;
 #endif // INT8_MMA_AVAILABLE
@@ -480,88 +414,6 @@ static __device__ __forceinline__ void vec_dot_q4_1_q8_1_dp4a(
     }
 }
 
-template <int mmq_x, int mmq_y, int nwarps>
-static __device__ __forceinline__ void vec_dot_q4_1_q8_1_mma(
-    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
-#ifdef INT8_MMA_AVAILABLE
-
-    typedef mma_int_A_I16K8 mma_A;
-    typedef mma_int_A_I16K4 mma_A_K4;
-    typedef mma_int_B_J8K8  mma_B;
-    typedef mma_int_C_I16J8 mma_C;
-
-    constexpr int granularity = mmq_get_granularity_device(mmq_x);
-    constexpr int rows_per_warp = 2 * granularity;
-    constexpr int ntx = rows_per_warp/mma_C::I; // Number of x minitiles per warp.
-
-    y += (threadIdx.y % ntx) * (mma_B::J*MMQ_TILE_Y_K);
-
-    const int   * x_qs = (const int   *) x;
-    const half2 * x_dm = (const half2 *) x_qs + WARP_SIZE;
-    const int   * y_qs = (const int   *) y + 4;
-    const half2 * y_ds = (const half2 *) y;
-
-    mma_A A[ntx][4];
-    half2 dmA[ntx][mma_C::ne/2][4];
-
-    const int i0 = (threadIdx.y / ntx) * (ntx*mma_A::I);
-
-#pragma unroll
-    for (int n = 0; n < ntx; ++n) {
-#pragma unroll
-        for (int k01 = 0; k01 < WARP_SIZE; k01 += QR4_1*QI4_1) {
-            const int k0 = k00 + k01;
-
-            A[n][k01/(QR4_1*QI4_1)].load_low(x_qs + (i0 + n*mma_A::I)*MMQ_MMA_TILE_X_K_Q4_1 + k0/QR4_1, MMQ_MMA_TILE_X_K_Q4_1);
-            A[n][k01/(QR4_1*QI4_1)].x[2]  = (A[n][k01/(QR4_1*QI4_1)].x[0] >> 4) & 0x0F0F0F0F;
-            A[n][k01/(QR4_1*QI4_1)].x[3]  = (A[n][k01/(QR4_1*QI4_1)].x[1] >> 4) & 0x0F0F0F0F;
-            A[n][k01/(QR4_1*QI4_1)].x[0] &= 0x0F0F0F0F;
-            A[n][k01/(QR4_1*QI4_1)].x[1] &= 0x0F0F0F0F;
-
-#pragma unroll
-            for (int l = 0; l < mma_C::ne/2; ++l) {
-                const int i = i0 + n*mma_C::I + mma_C::get_i(2*l);
-
-                dmA[n][l][k01/(QR4_1*QI4_1)] = x_dm[i*MMQ_MMA_TILE_X_K_Q4_1 + k0/(QR4_1*QI4_1)];
-            }
-        }
-    }
-
-#pragma unroll
-    for (int j0 = 0; j0 < mmq_x; j0 += ntx*mma_C::J) {
-#pragma unroll
-        for (int k01 = 0; k01 < WARP_SIZE; k01 += QR4_1*QI4_1) {
-            mma_B B;
-            half2 dsB[mma_C::ne/2];
-
-            B.load(y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
-
-#pragma unroll
-            for (int l = 0; l < mma_C::ne/2; ++l) {
-                const int j = j0 + mma_C::get_j(l);
-
-                dsB[l] = y_ds[j*MMQ_TILE_Y_K + k01/QI8_1];
-            }
-
-#pragma unroll
-            for (int n = 0; n < ntx; ++n) {
-                mma_C C;
-                C.mma_K8(A[n][k01/(QR4_1*QI4_1)], B);
-
-#pragma unroll
-                for (int l = 0; l < mma_C::ne; ++l) {
-                    const half2 dmA_dsB = dmA[n][l/2][k01/(QR4_1*QI4_1)]*dsB[l%2];
-                    sum[(j0/mma_C::J + n)*mma_C::ne + l] += __low2float(dmA_dsB)*C.x[l] + __high2float(dmA_dsB);
-                }
-            }
-        }
-    }
-#else
-    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum);
-    NO_DEVICE_CODE;
-#endif // INT8_MMA_AVAILABLE
-}
-
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
     const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
 
@@ -789,10 +641,9 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_dp4a(
     }
 }
 
-template <int mmq_x, int mmq_y, int nwarps>
+template <int mmq_x, int mmq_y, int nwarps, mmq_q8_1_ds_layout ds_layout>
 static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
     const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
-#ifdef INT8_MMA_AVAILABLE
 
     typedef mma_int_A_I16K8 mma_A;
     typedef mma_int_B_J8K8  mma_B;
@@ -808,6 +659,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
     const float * x_df = (const float *) x_qs + 2*WARP_SIZE;
     const int   * y_qs = (const int   *) y + 4;
     const float * y_df = (const float *) y;
+    const half2 * y_ds = (const half2 *) y;
 
     mma_A A[ntx][WARP_SIZE/QI8_0];
     float dA[ntx][mma_C::ne/2][WARP_SIZE/QI8_0];
@@ -840,18 +692,20 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
     for (int j0 = 0; j0 < mmq_x; j0 += ntx*mma_C::J) {
 #pragma unroll
         for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_0) {
-            const int k0 = k00 + k01;
-
-            mma_B B;
+            mma_B  B;
             float dB[mma_C::ne/2];
 
-            B.load(y_qs + j0*MMQ_TILE_Y_K + k0 % WARP_SIZE, MMQ_TILE_Y_K);
+            B.load(y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
 
 #pragma unroll
             for (int l = 0; l < mma_C::ne/2; ++l) {
                 const int j = j0 + mma_C::get_j(l);
 
-                dB[l] = y_df[j*MMQ_TILE_Y_K + (k0/QI8_1) % (WARP_SIZE/QI8_1)];
+                if (ds_layout == MMQ_Q8_1_DS_LAYOUT_D4) {
+                    dB[l] =             y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
+                } else {
+                    dB[l] = __low2float(y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]);
+                }
             }
 
 #pragma unroll
@@ -866,10 +720,6 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
             }
         }
     }
-#else
-    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum);
-    NO_DEVICE_CODE;
-#endif // INT8_MMA_AVAILABLE
 }
 
 template <int mmq_x, int mmq_y, int nwarps>
@@ -905,7 +755,6 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a(
 template <int mmq_x, int mmq_y, int nwarps>
 static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma(
     const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
-#ifdef INT8_MMA_AVAILABLE
 
     typedef mma_int_A_I16K8 mma_A;
     typedef mma_int_B_J8K8  mma_B;
@@ -922,8 +771,8 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma(
     const int   * y_qs = (const int   *) y + 4;
     const half2 * y_dm = (const half2 *) y;
 
-    mma_A   A[ntx][WARP_SIZE/QI8_1];
-    half2 dmA[ntx][mma_C::ne/2][WARP_SIZE/QI8_1];
+    mma_A    A[ntx][WARP_SIZE/QI8_1];
+    float2 dmA[ntx][mma_C::ne/2][WARP_SIZE/QI8_1];
 
     const int i0 = (threadIdx.y/ntx)*rows_per_warp;
 
@@ -944,7 +793,7 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma(
             for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_1) {
                 const int k0 = k00 + k01;
 
-                dmA[n][l][k01/QI8_1] = x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + k0/QI8_1];
+                dmA[n][l][k01/QI8_1] = __half22float2(x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + k0/QI8_1]);
             }
         }
     }
@@ -953,18 +802,16 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma(
     for (int j0 = 0; j0 < mmq_x; j0 += ntx*mma_C::J) {
 #pragma unroll
         for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_1) {
-            const int k0 = k00 + k01;
+            mma_B    B;
+            float2 dsB[mma_C::ne/2];
 
-            mma_B   B;
-            half2 dsB[mma_C::ne/2];
-
-            B.load(y_qs + j0*MMQ_TILE_Y_K + k0 % WARP_SIZE, MMQ_TILE_Y_K);
+            B.load(y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
 
 #pragma unroll
             for (int l = 0; l < mma_C::ne/2; ++l) {
                 const int j = j0 + mma_C::get_j(l);
 
-                dsB[l] = y_dm[j*MMQ_TILE_Y_K + (k0/QI8_1) % (WARP_SIZE/QI8_1)];
+                dsB[l] = __half22float2(y_dm[j*MMQ_TILE_Y_K + k01/QI8_1]);
             }
 
 #pragma unroll
@@ -974,8 +821,120 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma(
 
 #pragma unroll
                 for (int l = 0; l < mma_C::ne; ++l) {
-                const half2 dmA_dsB = dmA[n][l/2][k01/QI8_1]*dsB[l%2];
-                sum[(j0/mma_C::J + n)*mma_C::ne + l] += __low2float(dmA_dsB)*C.x[l] + __high2float(dmA_dsB);
+                    sum[(j0/mma_C::J + n)*mma_C::ne + l] += dmA[n][l/2][k01/QI8_1].x*dsB[l%2].x*C.x[l];
+                    sum[(j0/mma_C::J + n)*mma_C::ne + l] += dmA[n][l/2][k01/QI8_1].y*dsB[l%2].y;
+                }
+            }
+        }
+    }
+}
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_dp4a(
+    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
+
+    constexpr tile_x_sizes txs = MMQ_DP4A_TXS_Q8_0_16;
+    const int   * x_qs = (const int   *) x;
+    const float * x_df = (const float *) x_qs + txs.qs;
+    const int   * y_qs = (const int   *) y + 4;
+    const float * y_df = (const float *) y;
+
+// #pragma unroll
+    for (int k01 = 0; k01 < WARP_SIZE; k01 += QI8_0) {
+        const int k0 = k00 + k01;
+
+#pragma unroll
+        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+            const int j = j0 + threadIdx.y;
+
+#pragma unroll
+            for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_16_q8_1_impl<QI8_0>(
+                    &x_qs[i*(2*WARP_SIZE + 1) + k0],
+                    &y_qs[j*MMQ_TILE_Y_K + k01],
+                    &x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + k0/(QI8_0/2)],
+                    y_df[j*MMQ_TILE_Y_K + k01/QI8_1]);
+            }
+        }
+    }
+}
+
+template <int mmq_x, int mmq_y, int nwarps>
+static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
+    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
+#ifdef INT8_MMA_AVAILABLE
+
+    typedef mma_int_A_I16K4 mma_A;
+    typedef mma_int_A_I16K8 mma_A_K8;
+    typedef mma_int_B_J8K4  mma_B;
+    typedef mma_int_C_I16J8 mma_C;
+
+    constexpr int granularity = mmq_get_granularity_device(mmq_x);
+    constexpr int rows_per_warp = 2 * granularity;
+    constexpr int ntx = rows_per_warp/mma_C::I; // Number of x minitiles per warp.
+
+    y += (threadIdx.y % ntx) * (mma_B::J*MMQ_TILE_Y_K);
+
+    const int   * x_qs = (const int   *) x;
+    const float * x_df = (const float *) x_qs + WARP_SIZE*2;
+    const int   * y_qs = (const int   *) y + 4;
+    const float * y_df = (const float *) y;
+
+    const int i0 = (threadIdx.y / ntx) * (ntx*mma_A::I);
+
+    mma_A   A[ntx][8];
+    float  dA[ntx][mma_C::ne/2][8];
+
+#pragma unroll
+    for (int n = 0; n < ntx; ++n) {
+#pragma unroll
+        for (int k01 = 0; k01 < WARP_SIZE; k01 += 8) {
+            const int k0 = k00 + k01;
+
+            ((mma_A_K8 *) A[n])[k01/8].load(x_qs + (i0 + n*mma_A::I)*MMQ_MMA_TILE_X_K_Q3_K + k0, MMQ_MMA_TILE_X_K_Q3_K);
+        }
+
+#pragma unroll
+        for (int l = 0; l < mma_C::ne/2; ++l) {
+            const int i = i0 + n*mma_C::I + mma_C::get_i(2*l);
+
+#pragma unroll
+            for (int k01 = 0; k01 < WARP_SIZE; k01 += 4) {
+                const int k0 = k00 + k01;
+
+                dA[n][l][k01/4] = x_df[i*MMQ_MMA_TILE_X_K_Q3_K + k0/4];
+            }
+        }
+    }
+
+#pragma unroll
+    for (int j0 = 0; j0 < mmq_x; j0 += ntx*mma_C::J) {
+#pragma unroll
+        for (int k01 = 0; k01 < WARP_SIZE; k01 += QR3_K*VDR_Q3_K_Q8_1_MMQ) {
+            mma_B B[2];
+            float dB[mma_C::ne/2];
+
+            B[0].load(y_qs + j0*MMQ_TILE_Y_K + (k01 + 0),        MMQ_TILE_Y_K);
+            B[1].load(y_qs + j0*MMQ_TILE_Y_K + (k01 + mma_B::K), MMQ_TILE_Y_K);
+
+#pragma unroll
+            for (int l = 0; l < mma_C::ne/2; ++l) {
+                const int j = j0 + mma_C::get_j(l);
+
+                dB[l] = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
+            }
+
+#pragma unroll
+            for (int n = 0; n < ntx; ++n) {
+                mma_C C[2];
+                C[0].mma_K4(A[n][k01/4 + 0], B[0]);
+                C[1].mma_K4(A[n][k01/4 + 1], B[1]);
+
+#pragma unroll
+                for (int l = 0; l < mma_C::ne; ++l) {
+                    sum[(j0/mma_C::J + n)*mma_C::ne + l] += dB[l%2]*(C[0].x[l]*dA[n][l/2][k01/4 + 0] + C[1].x[l]*dA[n][l/2][k01/4 + 1]);
                 }
             }
         }
@@ -1222,7 +1181,6 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 #ifdef INT8_MMA_AVAILABLE
     int   * x_qs = (int   *)  x_tile;
     float * x_df = (float *) (x_qs + WARP_SIZE*2);
-    int   * x_sc = (int   *) (x_df + 1);
 #else
     constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q3_K, mmq_y);
     int   * x_qs = (int   *)  x_tile;
@@ -1262,23 +1220,6 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
         }
     }
 
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps*WARP_SIZE) {
-        int i = (i0 + threadIdx.y*WARP_SIZE + threadIdx.x) % mmq_y;
-
-        if (need_check) {
-            i = min(i, i_max);
-        }
-
-        const block_q3_K * bxi = (const block_q3_K *) x + kbx0 + i*stride;
-
-#ifdef INT8_MMA_AVAILABLE
-        x_df[i*MMQ_MMA_TILE_X_K_Q3_K] = bxi->d;
-#else
-        x_df[i]                       = bxi->d;
-#endif // INT8_MMA_AVAILABLE
-    }
-
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps*8) {
         int i = i0 + threadIdx.y*8 + threadIdx.x/(WARP_SIZE/8);
@@ -1302,11 +1243,32 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
         const int sc = __vsubss4(sc_low | sc_high, 0x20202020);
 
 #ifdef INT8_MMA_AVAILABLE
-        x_sc[i*MMQ_MMA_TILE_X_K_Q3_K + threadIdx.x % (WARP_SIZE/8)] = sc;
+        const int8_t * sc8 = (const int8_t *) &sc;
+        const float d = bxi->d;
+
+#pragma unroll
+        for (int l = 0; l < sizeof(int); ++l) {
+            x_df[i*MMQ_MMA_TILE_X_K_Q3_K + sizeof(int)*(threadIdx.x % (WARP_SIZE/8)) + l] = d*sc8[l];
+        }
 #else
-        x_sc[i*(WARP_SIZE/8) + i/8   + threadIdx.x % (WARP_SIZE/8)] = sc;
+        x_sc[i*(WARP_SIZE/8) + i/8 + threadIdx.x % (WARP_SIZE/8)] = sc;
 #endif // INT8_MMA_AVAILABLE
     }
+
+#ifndef INT8_MMA_AVAILABLE
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps*WARP_SIZE) {
+        int i = (i0 + threadIdx.y*WARP_SIZE + threadIdx.x) % mmq_y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q3_K * bxi = (const block_q3_K *) x + kbx0 + i*stride;
+
+        x_df[i] = bxi->d;
+    }
+#endif // INT8_MMA_AVAILABLE
 }
 
 template <int mmq_x, int mmq_y, int nwarps>
@@ -1342,99 +1304,14 @@ static __device__ __forceinline__ void vec_dot_q3_K_q8_1_dp4a(
     }
 }
 
-template <int mmq_x, int mmq_y, int nwarps>
-static __device__ __forceinline__ void vec_dot_q3_K_q8_1_mma(
-    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
-#ifdef INT8_MMA_AVAILABLE
-
-    typedef mma_int_A_I16K4 mma_A;
-    typedef mma_int_A_I16K8 mma_A_K8;
-    typedef mma_int_B_J8K4  mma_B;
-    typedef mma_int_C_I16J8 mma_C;
-
-    constexpr int granularity = mmq_get_granularity_device(mmq_x);
-    constexpr int rows_per_warp = 2 * granularity;
-    constexpr int ntx = rows_per_warp/mma_C::I; // Number of x minitiles per warp.
-
-    y += (threadIdx.y % ntx) * (mma_B::J*MMQ_TILE_Y_K);
-
-    const int   * x_qs = (const int   *) x;
-    const float * x_df = (const float *) x_qs + WARP_SIZE*2;
-    const int   * x_sc = (const int   *) x_df + 1;
-    const int   * y_qs = (const int   *) y + 4;
-    const float * y_df = (const float *) y;
-
-    const int i0 = (threadIdx.y / ntx) * (ntx*mma_A::I);
-
-    mma_A   A[ntx][8];
-    int   scA[ntx][mma_C::ne/2][8];
-    float  dA[ntx][mma_C::ne/2];
-
-#pragma unroll
-    for (int n = 0; n < ntx; ++n) {
-#pragma unroll
-        for (int k01 = 0; k01 < WARP_SIZE; k01 += 8) {
-            const int k0 = k00 + k01;
-
-            ((mma_A_K8 *) A[n])[k01/8].load(x_qs + (i0 + n*mma_A::I)*MMQ_MMA_TILE_X_K_Q3_K + k0, MMQ_MMA_TILE_X_K_Q3_K);
-        }
-
-#pragma unroll
-        for (int l = 0; l < mma_C::ne/2; ++l) {
-            const int i = i0 + n*mma_C::I + mma_C::get_i(2*l);
-
-#pragma unroll
-            for (int k01 = 0; k01 < WARP_SIZE; k01 += 16) {
-                const int k0 = k00 + k01;
-
-                const int sc_packed = x_sc[i*MMQ_MMA_TILE_X_K_Q3_K + k0/16];
-                const int8_t * sc = (const int8_t *) &sc_packed;
-
-#pragma unroll
-                for (int ksc = 0; ksc < sizeof(int); ++ksc) {
-                    scA[n][l][k01/4 + ksc] = sc[ksc];
-                }
-            }
-
-            dA[n][l] = x_df[i*MMQ_MMA_TILE_X_K_Q3_K];
-        }
-    }
-
-#pragma unroll
-    for (int j0 = 0; j0 < mmq_x; j0 += ntx*mma_C::J) {
-#pragma unroll
-        for (int k01 = 0; k01 < WARP_SIZE; k01 += QR3_K*VDR_Q3_K_Q8_1_MMQ) {
-            mma_B B[2];
-            float dB[mma_C::ne/2];
-
-            B[0].load(y_qs + j0*MMQ_TILE_Y_K + (k01 + 0),        MMQ_TILE_Y_K);
-            B[1].load(y_qs + j0*MMQ_TILE_Y_K + (k01 + mma_B::K), MMQ_TILE_Y_K);
-
-#pragma unroll
-            for (int l = 0; l < mma_C::ne/2; ++l) {
-                const int j = j0 + mma_C::get_j(l);
-
-                dB[l] = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
-            }
-
-#pragma unroll
-            for (int n = 0; n < ntx; ++n) {
-                mma_C C[2];
-                C[0].mma_K4(A[n][k01/4 + 0], B[0]);
-                C[1].mma_K4(A[n][k01/4 + 1], B[1]);
-
-#pragma unroll
-                for (int l = 0; l < mma_C::ne; ++l) {
-                    sum[(j0/mma_C::J + n)*mma_C::ne + l] += dA[n][l/2]*dB[l%2]*
-                        (C[0].x[l]*scA[n][l/2][k01/4 + 0] + C[1].x[l]*scA[n][l/2][k01/4 + 1]);
-                }
-            }
-        }
-    }
-#else
-    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum);
-    NO_DEVICE_CODE;
-#endif // INT8_MMA_AVAILABLE
+static __device__ __forceinline__ int unpack_scales_q45_K(const int * scales, const int ksc) {
+    // scale arrangement after the following two lines:
+    //   - ksc == 0: sc0, sc1, sc2, sc3
+    //   - ksc == 1: sc4, sc5, sc6, sc7
+    //   - ksc == 2:  m0,  m1,  m2,  m3
+    //   - ksc == 3:  m4,  m5,  m6,  m7
+    return ((scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F) | // lower 4 bits
+           ((scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030);  // upper 2 bits
 }
 
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
@@ -1442,8 +1319,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
 #ifdef INT8_MMA_AVAILABLE
     int   * x_qs = (int   *)  x_tile;
-    half2 * x_dm = (half2 *) (x_qs + WARP_SIZE);
-    int   * x_sc = (int   *) (x_dm + WARP_SIZE/QI4_K);
+    half2 * x_dm = (half2 *) (x_qs + 2*WARP_SIZE);
 #else
     constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q4_K, mmq_y);
     int   * x_qs = (int   *)  x_tile;
@@ -1451,9 +1327,6 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     int   * x_sc = (int   *) (x_dm + txs.dm);
 #endif // INT8_MMA_AVAILABLE
 
-    const int kbx  = 0;           // threadIdx.x / QI4_K
-    const int kqsx = threadIdx.x; // threadIdx.x % QI4_K
-
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + threadIdx.y;
@@ -1462,33 +1335,59 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
             i = min(i, i_max);
         }
 
-        const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride + kbx;
+        const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride;
+        const int qs0 = get_int_b4(bxi->qs, threadIdx.x);
 
 #ifdef INT8_MMA_AVAILABLE
-        x_qs[i*MMQ_MMA_TILE_X_K_Q4_K + threadIdx.x] = get_int_b4(bxi->qs, kqsx);
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 16*(threadIdx.x/8) + threadIdx.x % 8 + 0] = (qs0 >> 0) & 0x0F0F0F0F;
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 16*(threadIdx.x/8) + threadIdx.x % 8 + 8] = (qs0 >> 4) & 0x0F0F0F0F;
 #else
-        x_qs[i*(WARP_SIZE + 1)       + threadIdx.x] = get_int_b4(bxi->qs, kqsx);
+        x_qs[i*(WARP_SIZE + 1) + threadIdx.x] = qs0;
 #endif // INT8_MMA_AVAILABLE
     }
 
-    const int blocks_per_tile_x_row = WARP_SIZE / QI4_K;  // == 1 if QK_K == 256
-    const int kbxd = threadIdx.x % blocks_per_tile_x_row; // == 0 if QK_K == 256
+#ifdef INT8_MMA_AVAILABLE
 
 #pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
-        int i = (i0 + threadIdx.y * QI4_K + threadIdx.x / blocks_per_tile_x_row) % mmq_y;
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps*16) {
+        int i = (i0 + threadIdx.y*16 + threadIdx.x/(WARP_SIZE/16)) % mmq_y;
 
         if (need_check) {
             i = min(i, i_max);
         }
 
-        const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride + kbxd;
+        const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride;
+
+        const int * scales = (const int *) bxi->scales;
+        const int ksc = threadIdx.x % (WARP_SIZE/16);
+
+        const int sc32 = unpack_scales_q45_K(scales, ksc + 0);
+        const int  m32 = unpack_scales_q45_K(scales, ksc + 2);
+
+        const uint8_t * sc8 = (const uint8_t *) &sc32;
+        const uint8_t *  m8 = (const uint8_t *)  &m32;
+
+        const half2 dm = bxi->dm * make_half2(1.0f, -1.0f);
+
+#pragma unroll
+        for (int l = 0; l < sizeof(int); ++l) {
+            x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + sizeof(int)*ksc + l] = dm*make_half2(sc8[l], m8[l]);
+        }
+    }
 
-#ifdef INT8_MMA_AVAILABLE
-        x_dm[i*MMQ_MMA_TILE_X_K_Q4_K       + kbxd] = bxi->dm;
 #else
-        x_dm[i*(WARP_SIZE/QI4_K) + i/QI4_K + kbxd] = bxi->dm;
-#endif // INT8_MMA_AVAILABLE
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps*QI4_K) {
+        int i = (i0 + threadIdx.y*QI4_K + threadIdx.x) % mmq_y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride;
+
+        x_dm[i] = bxi->dm;
     }
 
 #pragma unroll
@@ -1504,17 +1403,11 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
         const int * scales = (const int *) bxi->scales;
 
         const int ksc = threadIdx.x % (WARP_SIZE/8);
+        const int scales8 = unpack_scales_q45_K(scales, ksc);
 
-        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
-        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
-        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
-
-#ifdef INT8_MMA_AVAILABLE
-        x_sc[i*MMQ_MMA_TILE_X_K_Q4_K + ksc] = scales8;
-#else
-        x_sc[i*(WARP_SIZE/8) + i/8   + ksc] = scales8;
-#endif // INT8_MMA_AVAILABLE
+        x_sc[i*(WARP_SIZE/8) + i/8 + ksc] = scales8;
     }
+#endif // INT8_MMA_AVAILABLE
 }
 
 template <int mmq_x, int mmq_y, int nwarps>
@@ -1544,133 +1437,18 @@ static __device__ __forceinline__ void vec_dot_q4_K_q8_1_dp4a(
 
                 sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_K_q8_1_impl_mmq(
                     &x_qs[i*(WARP_SIZE + 1) + k0/2], &y_qs[j*MMQ_TILE_Y_K + k01], sc, sc+8,
-                    x_dm[i*(WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]);
+                    x_dm[i], &y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]);
             }
         }
     }
 }
 
-template <int mmq_x, int mmq_y, int nwarps>
-static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mma(
-    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
-#ifdef INT8_MMA_AVAILABLE
-
-    typedef mma_int_A_I16K8 mma_A;
-    typedef mma_int_B_J8K8  mma_B;
-    typedef mma_int_C_I16J8 mma_C;
-
-    constexpr int granularity = mmq_get_granularity_device(mmq_x);
-    constexpr int rows_per_warp = 2 * granularity;
-    constexpr int ntx = rows_per_warp/mma_C::I; // Number of x minitiles per warp.
-
-    y += (threadIdx.y % ntx) * (mma_B::J*MMQ_TILE_Y_K);
-
-    const int   * x_qs = (const int   *) x;
-    const half2 * x_dm = (const half2 *) x_qs + WARP_SIZE;
-    const int   * x_sc = (const int   *) x_dm + WARP_SIZE/QI4_K;
-    const int   * y_qs = (const int   *) y + 4;
-    const half2 * y_ds = (const half2 *) y;
-
-    const int i0 = (threadIdx.y / ntx) * (ntx*mma_A::I);
-
-    mma_A   A[ntx][4];
-    int   scA[ntx][mma_C::ne/2][4];
-    int    mA[ntx][mma_C::ne/2][4];
-    half2 dmA[ntx][mma_C::ne/2];
-
-#pragma unroll
-    for (int n = 0; n < ntx; ++n) {
-#pragma unroll
-        for (int k01 = 0; k01 < WARP_SIZE; k01 += 16) {
-            const int k0 = k00 + k01;
-
-            A[n][k01/8 + 0].load(x_qs + (i0 + n*mma_A::I)*MMQ_MMA_TILE_X_K_Q4_K + k0/QR4_K, MMQ_MMA_TILE_X_K_Q4_K);
-
-#pragma unroll
-            for (int l = 0; l < mma_A::ne; ++l) {
-                A[n][k01/8 + 1].x[l]  = (A[n][k01/8 + 0].x[l] >> 4) & 0x0F0F0F0F;
-                A[n][k01/8 + 0].x[l] &= 0x0F0F0F0F;
-            }
-        }
-
-#pragma unroll
-        for (int l = 0; l < mma_C::ne/2; ++l) {
-            const int i = i0 + n*mma_C::I + mma_C::get_i(2*l);
-
-            const int sc_packed = x_sc[i*MMQ_MMA_TILE_X_K_Q4_K + (k00/32 + 0)];
-            const int  m_packed = x_sc[i*MMQ_MMA_TILE_X_K_Q4_K + (k00/32 + 2)];
-
-            const uint8_t * sc = (const uint8_t *) &sc_packed;
-            const uint8_t *  m = (const uint8_t *)  &m_packed;
-
-#pragma unroll
-            for (int ksc = 0; ksc < sizeof(int); ++ksc) {
-                scA[n][l][ksc] = sc[ksc];
-                mA[n][l][ksc]  =  m[ksc];
-            }
-        }
-
-#pragma unroll
-        for (int l = 0; l < mma_C::ne/2; ++l) {
-            const int i = i0 + n*mma_A::I + mma_C::get_i(2*l);
-
-            dmA[n][l] = x_dm[i*MMQ_MMA_TILE_X_K_Q4_K];
-        }
-    }
-
-#pragma unroll
-    for (int j0 = 0; j0 < mmq_x; j0 += ntx*mma_C::J) {
-        float tmpd[ntx][mma_C::ne] = {{0.0f}};
-        float tmpm[ntx][mma_C::ne] = {{0.0f}};
-
-#pragma unroll
-        for (int k01 = 0; k01 < WARP_SIZE; k01 += 8) {
-            mma_B   B;
-            half2 dsB[mma_C::ne/2];
-
-            B.load(y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
-
-#pragma unroll
-            for (int l = 0; l < mma_C::ne/2; ++l) {
-                const int j = j0 + mma_C::get_j(l);
-
-                dsB[l] = y_ds[j*MMQ_TILE_Y_K + k01/QI8_1];
-            }
-
-#pragma unroll
-            for (int n = 0; n < ntx; ++n) {
-                mma_C C;
-                C.mma_K8(A[n][k01/8], B);
-
-#pragma unroll
-                for (int l = 0; l < mma_C::ne; ++l) {
-                    tmpd[n][l] += (C.x[l]*scA[n][l/2][k01/8]) *  __low2float(dsB[l%2]);
-                    tmpm[n][l] += mA[n][l/2][k01/8]           * __high2float(dsB[l%2]);
-                }
-            }
-        }
-
-#pragma unroll
-        for (int n = 0; n < ntx; ++n) {
-#pragma unroll
-            for (int l = 0; l < mma_C::ne; ++l) {
-                sum[(j0/mma_C::J + n)*mma_C::ne + l] += __low2float(dmA[n][l/2])*tmpd[n][l] - __high2float(dmA[n][l/2])*tmpm[n][l];
-            }
-        }
-    }
-#else
-    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum);
-    NO_DEVICE_CODE;
-#endif // INT8_MMA_AVAILABLE
-}
-
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
     const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
 
 #ifdef INT8_MMA_AVAILABLE
     int   * x_qs = (int   *)  x_tile;
     half2 * x_dm = (half2 *) (x_qs + WARP_SIZE*2);
-    int   * x_sc = (int   *) (x_dm + WARP_SIZE/QI5_K);
 #else
     constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_Q5_K, mmq_y);
     int   * x_qs = (int   *)  x_tile;
@@ -1678,9 +1456,6 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     int   * x_sc = (int   *) (x_dm + txs.dm);
 #endif // INT8_MMA_AVAILABLE
 
-    const int kbx  = 0;           // threadIdx.x / QI5_K
-    const int kqsx = threadIdx.x; // threadIdx.x % QI5_K
-
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + threadIdx.y;
@@ -1689,73 +1464,91 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
             i = min(i, i_max);
         }
 
-        const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride + kbx;
-        const int ky = QR5_K*kqsx;
+        const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride;
+        const int ky = QR5_K*threadIdx.x;
 
-        const int ql = get_int_b4(bxi->qs, kqsx);
+        const int ql = get_int_b4(bxi->qs, threadIdx.x);
         const int ql0 = (ql >> 0) & 0x0F0F0F0F;
         const int ql1 = (ql >> 4) & 0x0F0F0F0F;
 
-        const int qh = get_int_b4(bxi->qh, kqsx % (QI5_K/4));
-        const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
-        const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
+        const int qh = get_int_b4(bxi->qh, threadIdx.x % (QI5_K/4));
+        const int qh0 = ((qh >> (2 * (threadIdx.x / (QI5_K/4)) + 0)) << 4) & 0x10101010;
+        const int qh1 = ((qh >> (2 * (threadIdx.x / (QI5_K/4)) + 1)) << 4) & 0x10101010;
 
         const int kq0 = ky - ky % (QI5_K/2) + threadIdx.x % (QI5_K/4) + 0;
-        const int kq1 = ky - ky % (QI5_K/2) + threadIdx.x % (QI5_K/4) + (QI5_K/4);
+        const int kq1 = ky - ky % (QI5_K/2) + threadIdx.x % (QI5_K/4) + QI5_K/4;
 
 #ifdef INT8_MMA_AVAILABLE
-        x_qs[i*MMQ_MMA_TILE_X_K_Q5_K + kq0] = ql0 | qh0;
-        x_qs[i*MMQ_MMA_TILE_X_K_Q5_K + kq1] = ql1 | qh1;
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + kq0] = ql0 | qh0;
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + kq1] = ql1 | qh1;
 #else
         x_qs[i*(2*WARP_SIZE + 1)     + kq0] = ql0 | qh0;
         x_qs[i*(2*WARP_SIZE + 1)     + kq1] = ql1 | qh1;
 #endif // INT8_MMA_AVAILABLE
     }
 
-    const int blocks_per_tile_x_row = WARP_SIZE / QI5_K;  // == 1 if QK_K == 256
-    const int kbxd = threadIdx.x % blocks_per_tile_x_row; // == 0 if QK_K == 256
+#ifdef INT8_MMA_AVAILABLE
 
 #pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
-        int i = (i0 + threadIdx.y * QI5_K + threadIdx.x / blocks_per_tile_x_row) % mmq_y;
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps*16) {
+        int i = (i0 + threadIdx.y*16 + threadIdx.x/(WARP_SIZE/16)) % mmq_y;
 
         if (need_check) {
             i = min(i, i_max);
         }
 
-        const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride + kbxd;
+        const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride;
+
+        const int * scales = (const int *) bxi->scales;
+        const int ksc = threadIdx.x % (WARP_SIZE/16);
+
+        const int sc32 = unpack_scales_q45_K(scales, ksc + 0);
+        const int  m32 = unpack_scales_q45_K(scales, ksc + 2);
+
+        const uint8_t * sc8 = (const uint8_t *) &sc32;
+        const uint8_t *  m8 = (const uint8_t *)  &m32;
+
+        const half2 dm = bxi->dm * make_half2(1.0f, -1.0f);
+
+#pragma unroll
+        for (int l = 0; l < sizeof(int); ++l) {
+            x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + sizeof(int)*ksc + l] = dm*make_half2(sc8[l], m8[l]);
+        }
+    }
 
-#ifdef INT8_MMA_AVAILABLE
-        x_dm[i*MMQ_MMA_TILE_X_K_Q5_K       + kbxd] = bxi->dm;
 #else
-        x_dm[i*(WARP_SIZE/QI5_K) + i/QI5_K + kbxd] = bxi->dm;
-#endif // INT8_MMA_AVAILABLE
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps*QI5_K) {
+        int i = (i0 + threadIdx.y*QI5_K + threadIdx.x) % mmq_y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride;
+
+        x_dm[i] = bxi->dm;
     }
 
 #pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
-        int i = (i0 + threadIdx.y * 8 + threadIdx.x / (WARP_SIZE/8)) % mmq_y;
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps*8) {
+        int i = (i0 + threadIdx.y*8 + threadIdx.x/(WARP_SIZE/8)) % mmq_y;
 
         if (need_check) {
             i = min(i, i_max);
         }
 
-        const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride + (threadIdx.x % (WARP_SIZE/8)) / (QI5_K/8);
+        const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride;
 
         const int * scales = (const int *) bxi->scales;
 
         const int ksc = threadIdx.x % (WARP_SIZE/8);
+        const int scales8 = unpack_scales_q45_K(scales, ksc);
 
-        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
-        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
-        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
-
-#ifdef INT8_MMA_AVAILABLE
-        x_sc[i*MMQ_MMA_TILE_X_K_Q5_K + ksc] = scales8;
-#else
-        x_sc[i*(WARP_SIZE/8) + i/8   + ksc] = scales8;
-#endif // INT8_MMA_AVAILABLE
+        x_sc[i*(WARP_SIZE/8) + i/8 + ksc] = scales8;
     }
+#endif // INT8_MMA_AVAILABLE
 }
 
 template <int mmq_x, int mmq_y, int nwarps>
@@ -1785,122 +1578,12 @@ static __device__ __forceinline__ void vec_dot_q5_K_q8_1_dp4a(
 
                 sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q5_K_q8_1_impl_mmq(
                     &x_qs[i*(QR5_K*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01], sc, sc+8,
-                    x_dm[i*(WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]);
+                    x_dm[i], &y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]);
             }
         }
     }
 }
 
-template <int mmq_x, int mmq_y, int nwarps>
-static __device__ __forceinline__ void vec_dot_q5_K_q8_1_mma(
-    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int & k00) {
-#ifdef INT8_MMA_AVAILABLE
-
-    typedef mma_int_A_I16K8 mma_A;
-    typedef mma_int_B_J8K8  mma_B;
-    typedef mma_int_C_I16J8 mma_C;
-
-    constexpr int granularity = mmq_get_granularity_device(mmq_x);
-    constexpr int rows_per_warp = 2 * granularity;
-    constexpr int ntx = rows_per_warp/mma_C::I; // Number of x minitiles per warp.
-
-    y += (threadIdx.y % ntx) * (mma_B::J*MMQ_TILE_Y_K);
-
-    const int   * x_qs = (const int   *) x;
-    const half2 * x_dm = (const half2 *) x_qs + WARP_SIZE*2;
-    const int   * x_sc = (const int   *) x_dm + WARP_SIZE/QI5_K;
-    const int   * y_qs = (const int   *) y + 4;
-    const half2 * y_ds = (const half2 *) y;
-
-    const int i0 = (threadIdx.y / ntx) * (ntx*mma_A::I);
-
-    mma_A   A[ntx][4];
-    int   scA[ntx][mma_C::ne/2][4];
-    int    mA[ntx][mma_C::ne/2][4];
-    half2 dmA[ntx][mma_C::ne/2];
-
-#pragma unroll
-    for (int n = 0; n < ntx; ++n) {
-#pragma unroll
-        for (int k01 = 0; k01 < WARP_SIZE; k01 += 8) {
-            const int k0 = k00 + k01;
-
-            A[n][k01/8].load(x_qs + (i0 + n*mma_A::I)*MMQ_MMA_TILE_X_K_Q5_K + k0, MMQ_MMA_TILE_X_K_Q5_K);
-        }
-
-#pragma unroll
-        for (int l = 0; l < mma_C::ne/2; ++l) {
-            const int i = i0 + n*mma_C::I + mma_C::get_i(2*l);
-
-            const int sc_packed = x_sc[i*MMQ_MMA_TILE_X_K_Q5_K + (k00/32 + 0)];
-            const int  m_packed = x_sc[i*MMQ_MMA_TILE_X_K_Q5_K + (k00/32 + 2)];
-
-            const uint8_t * sc = (const uint8_t *) &sc_packed;
-            const uint8_t *  m = (const uint8_t *)  &m_packed;
-
-#pragma unroll
-            for (int ksc = 0; ksc < sizeof(int); ++ksc) {
-                scA[n][l][ksc] = sc[ksc];
-                mA[n][l][ksc]  =  m[ksc];
-            }
-        }
-
-    #pragma unroll
-        for (int l = 0; l < mma_C::ne/2; ++l) {
-            const int i = i0 + n*mma_C::I + mma_C::get_i(2*l);
-
-            dmA[n][l] = x_dm[i*MMQ_MMA_TILE_X_K_Q5_K];
-        }
-    }
-
-#pragma unroll
-    for (int j0 = 0; j0 < mmq_x; j0 += ntx*mma_C::J) {
-        float tmpd[ntx][mma_C::ne] = {{0.0f}};
-        float tmpm[ntx][mma_C::ne] = {{0.0f}};
-
-#pragma unroll
-        for (int k01 = 0; k01 < WARP_SIZE; k01 += 8) {
-            const int k0 = k00 + k01;
-
-            mma_B   B;
-            half2 dsB[mma_C::ne/2];
-
-            B.load(y_qs + j0*MMQ_TILE_Y_K + k0 % WARP_SIZE, MMQ_TILE_Y_K);
-
-#pragma unroll
-            for (int l = 0; l < mma_C::ne/2; ++l) {
-                const int j = j0 + mma_C::get_j(l);
-
-                dsB[l] = y_ds[j*MMQ_TILE_Y_K + (k0/QI8_1) % (WARP_SIZE/QI8_1)];
-            }
-
-#pragma unroll
-        for (int n = 0; n < ntx; ++n) {
-                mma_C C;
-                C.mma_K8(A[n][k01/8], B);
-
-#pragma unroll
-                for (int l = 0; l < mma_C::ne; ++l) {
-                    tmpd[n][l] += (C.x[l]*scA[n][l/2][k01/8]) *  __low2float(dsB[l%2]);
-                    tmpm[n][l] += mA[n][l/2][k01/8]           * __high2float(dsB[l%2]);
-                }
-            }
-        }
-
-#pragma unroll
-        for (int n = 0; n < ntx; ++n) {
-#pragma unroll
-            for (int l = 0; l < mma_C::ne; ++l) {
-                sum[(j0/mma_C::J + n)*mma_C::ne + l] += __low2float(dmA[n][l/2])*tmpd[n][l] - __high2float(dmA[n][l/2])*tmpm[n][l];
-            }
-        }
-    }
-#else
-    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum);
-    NO_DEVICE_CODE;
-#endif // INT8_MMA_AVAILABLE
-}
-
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
     const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
 
@@ -1915,9 +1598,6 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     int   * x_sc = (int   *) (x_df + txs.dm);
 #endif // INT8_MMA_AVAILABLE
 
-    const int kbx  = 0;           // threadIdx.x / QI6_K
-    const int kqsx = threadIdx.x; // threadIdx.x % QI6_K
-
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
         int i = i0 + threadIdx.y;
@@ -1926,19 +1606,18 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
             i = min(i, i_max);
         }
 
-        const block_q6_K * bxi = (const block_q6_K *) x + kbx0 + i*stride + kbx;
-        const int ky = QR6_K*kqsx;
+        const block_q6_K * bxi = (const block_q6_K *) x + kbx0 + i*stride;
 
-        const int ql = get_int_b2(bxi->ql, kqsx);
+        const int ql = get_int_b2(bxi->ql, threadIdx.x);
         const int ql0 = (ql >> 0) & 0x0F0F0F0F;
         const int ql1 = (ql >> 4) & 0x0F0F0F0F;
 
-        const int qh = get_int_b2(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
-        const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
-        const int qh1 =  (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4))))       & 0x30303030;
+        const int qh = get_int_b2(bxi->qh, (QI6_K/4) * (threadIdx.x / (QI6_K/2)) + threadIdx.x % (QI6_K/4));
+        const int qh0 = ((qh >> ((threadIdx.x & 0x08) >> 2)) << 4) & 0x30303030;
+        const int qh1 =  (qh >> ((threadIdx.x & 0x08) >> 2))       & 0x30303030;
 
-        const int kq0 = ky - ky % QI6_K + threadIdx.x % (QI6_K/2) + 0;
-        const int kq1 = ky - ky % QI6_K + threadIdx.x % (QI6_K/2) + (QI6_K/2);
+        const int kq0 = 2*threadIdx.x - threadIdx.x % (QI6_K/2) + 0;
+        const int kq1 = 2*threadIdx.x - threadIdx.x % (QI6_K/2) + QI6_K/2;
 
 #ifdef INT8_MMA_AVAILABLE
         x_qs[i*MMQ_MMA_TILE_X_K_Q6_K + kq0] = __vsubss4(ql0 | qh0, 0x20202020);
@@ -2187,6 +1866,358 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     }
 }
 
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_xxs(
+    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
+
+#ifdef INT8_MMA_AVAILABLE
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + WARP_SIZE*2);
+#else
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ2_XXS, mmq_y);
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + txs.qs);
+#endif // INT8_MMA_AVAILABLE
+
+    const int kqsx = threadIdx.x % (QI2_XXS/2);
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/(QI2_XXS/2)) {
+        int i = i0 + threadIdx.y*(2*WARP_SIZE/QI2_XXS) + threadIdx.x/(QI2_XXS/2);
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_iq2_xxs * bxi = (const block_iq2_xxs *) x + kbx0 + i*stride;
+
+        const int q2 = get_int_b2(bxi->qs, 2*kqsx+0);
+        const uint8_t * aux8 = (const uint8_t *) &q2;
+        const uint32_t aux32 = get_int_b2(bxi->qs, 2*kqsx+1);
+
+#pragma unroll
+        for (int l = 0; l < QR2_XXS; ++l) {
+            const int * grid_pos = (const int *) (iq2xxs_grid + aux8[l]);
+            const int signs_packed = ksigns_iq2xs[(aux32 >> (7*l)) & 0x7F];
+
+            const int signs0 = __vcmpne4(((signs_packed & 0x03) << 7) | ((signs_packed & 0x0C) << 21), 0x00000000);
+            const int grid0 = __vsub4(grid_pos[0] ^ signs0, signs0);
+
+            const int signs1 = __vcmpne4(((signs_packed & 0x30) << 3) | ((signs_packed & 0xC0) << 17), 0x00000000);
+            const int grid1 = __vsub4(grid_pos[1] ^ signs1, signs1);
+
+#ifdef INT8_MMA_AVAILABLE
+            x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l + 0)] = grid0;
+            x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l + 1)] = grid1;
+#else
+            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l + 0)] = grid0;
+            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l + 1)] = grid1;
+#endif // INT8_MMA_AVAILABLE
+        }
+
+        const int ls = aux32 >> 28;
+        const float d = bxi->d;
+#ifdef INT8_MMA_AVAILABLE
+        x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = (ls*d + d/2)/4;
+#else
+        x_df[i*(WARP_SIZE/4) + i/4   + kqsx] = (ls*d + d/2)/4;
+#endif // INT8_MMA_AVAILABLE
+    }
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_xs(
+    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
+
+#ifdef INT8_MMA_AVAILABLE
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + WARP_SIZE*2);
+#else
+    constexpr tile_x_sizes txs = MMQ_DP4A_TXS_Q8_0_16;
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + txs.qs);
+#endif // INT8_MMA_AVAILABLE
+
+    const int kqsx = threadIdx.x % (QI2_XS/2);
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/(QI2_XS/2)) {
+        int i = i0 + threadIdx.y*(2*WARP_SIZE/QI2_XS) + threadIdx.x/(QI2_XS/2);
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_iq2_xs * bxi = (const block_iq2_xs *) x + kbx0 + i*stride;
+
+        const int2 q2_packed = make_int2(get_int_b2(bxi->qs, 2*kqsx+0), get_int_b2(bxi->qs, 2*kqsx+1));
+        const uint16_t * q2 = (const uint16_t *) &q2_packed;
+
+    #pragma unroll
+        for (int l = 0; l < QR2_XS; ++l) {
+            const uint32_t * grid_pos = (const uint32_t *)(iq2xs_grid + (q2[l] & 0x000001FF));
+            const uint32_t * signs    = (const uint32_t *)(ksigns64   + (q2[l] >> 9));
+
+            const int grid_l = __vsub4(grid_pos[0] ^ signs[0], signs[0]);
+            const int grid_h = __vsub4(grid_pos[1] ^ signs[1], signs[1]);
+
+#ifdef INT8_MMA_AVAILABLE
+            x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + (2*l + 0)] = grid_l;
+            x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + (2*l + 1)] = grid_h;
+#else
+            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l + 0)] = grid_l;
+            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l + 1)] = grid_h;
+#endif // INT8_MMA_AVAILABLE
+        }
+
+        const int ls = bxi->scales[kqsx];
+        const float d = bxi->d;
+#ifdef INT8_MMA_AVAILABLE
+        x_df[i*MMQ_MMA_TILE_X_K_Q3_K               + 2*kqsx+0] = ((ls &  0x0F)*d + d/2)/4;
+        x_df[i*MMQ_MMA_TILE_X_K_Q3_K               + 2*kqsx+1] = ((ls >>    4)*d + d/2)/4;
+#else
+        x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+0] = ((ls &  0x0F)*d + d/2)/4;
+        x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+1] = ((ls >>    4)*d + d/2)/4;
+#endif // INT8_MMA_AVAILABLE
+    }
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq2_s(
+    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
+
+#ifdef INT8_MMA_AVAILABLE
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + WARP_SIZE*2);
+#else
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ2_S, mmq_y);
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + txs.qs);
+#endif // INT8_MMA_AVAILABLE
+
+    const int kqsx = threadIdx.x % (QI2_S/2);
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/(QI2_S/2)) {
+        int i = i0 + threadIdx.y*(2*WARP_SIZE/QI2_S) + threadIdx.x/(QI2_S/2);
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_iq2_s * bxi = (const block_iq2_s *) x + kbx0 + i*stride;
+
+        const int       qs_packed = get_int_b2(bxi->qs, kqsx);
+        const uint8_t * qs        = (const uint8_t *) &qs_packed;
+
+        const int qh = bxi->qh[kqsx];
+
+        const int       signs_packed_32 = get_int_b2(bxi->qs, QK_K/32 + kqsx);
+        const uint8_t * signs_packed_8  = (const uint8_t *) &signs_packed_32;
+
+#pragma unroll
+        for (int l = 0; l < QR2_S; ++l) {
+            const int * grid_pos = (const int *)(iq2s_grid + (qs[l] | ((qh << (8-2*l)) & 0x300)));
+
+            const int signs0 = __vcmpne4(((signs_packed_8[l] & 0x03) << 7) | ((signs_packed_8[l] & 0x0C) << 21), 0x00000000);
+            const int signs1 = __vcmpne4(((signs_packed_8[l] & 0x30) << 3) | ((signs_packed_8[l] & 0xC0) << 17), 0x00000000);
+
+            const int grid_l = __vsub4(grid_pos[0] ^ signs0, signs0);
+            const int grid_h = __vsub4(grid_pos[1] ^ signs1, signs1);
+
+#ifdef INT8_MMA_AVAILABLE
+            x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + (2*l + 0)] = grid_l;
+            x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + (2*l + 1)] = grid_h;
+#else
+            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l + 0)] = grid_l;
+            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l + 1)] = grid_h;
+#endif // INT8_MMA_AVAILABLE
+        }
+
+        const int ls = bxi->scales[kqsx];
+        const float d = bxi->d;
+#ifdef INT8_MMA_AVAILABLE
+        x_df[i*MMQ_MMA_TILE_X_K_Q3_K               + 2*kqsx+0] = ((ls &  0x0F)*d + d/2)/4;
+        x_df[i*MMQ_MMA_TILE_X_K_Q3_K               + 2*kqsx+1] = ((ls >>    4)*d + d/2)/4;
+#else
+        x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+0] = ((ls &  0x0F)*d + d/2)/4;
+        x_df[i*(2*WARP_SIZE*2/QI8_0) + i/(QI8_0/4) + 2*kqsx+1] = ((ls >>    4)*d + d/2)/4;
+#endif // INT8_MMA_AVAILABLE
+    }
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq3_xxs(
+    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
+
+#ifdef INT8_MMA_AVAILABLE
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + WARP_SIZE*2);
+#else
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ3_XXS, mmq_y);
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + txs.qs);
+#endif // INT8_MMA_AVAILABLE
+
+    const int kqsx = threadIdx.x % (QI3_XXS/2);
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/(QI3_XXS/2)) {
+        int i = i0 + threadIdx.y*(2*WARP_SIZE/QI3_XXS) + threadIdx.x/(QI3_XXS/2);
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_iq3_xxs * bxi = (const block_iq3_xxs *) x + kbx0 + i*stride;
+
+        const int2 q3_packed = make_int2(get_int_b2(bxi->qs, 2*kqsx+0), get_int_b2(bxi->qs, 2*kqsx+1));
+        const uint8_t * q3 = (const uint8_t *) &q3_packed;
+        const uint32_t aux32 = get_int_b2(bxi->qs, QK_K/16 + kqsx);
+
+#pragma unroll
+        for (int l = 0; l < QR3_XXS; ++l) {
+            const int2 grid_pos = make_int2(iq3xxs_grid[q3[2*l+0]], iq3xxs_grid[q3[2*l+1]]);
+
+            const int * signs = (const int *)(ksigns64 + ((aux32 >> (7*l)) & 0x7F));
+
+            const int grid_l = __vsub4(grid_pos.x ^ signs[0], signs[0]);
+            const int grid_h = __vsub4(grid_pos.y ^ signs[1], signs[1]);
+
+#ifdef INT8_MMA_AVAILABLE
+            x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l + 0)] = grid_l;
+            x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l + 1)] = grid_h;
+#else
+            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l + 0)] = grid_l;
+            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l + 1)] = grid_h;
+#endif // INT8_MMA_AVAILABLE
+        }
+
+        const int ls = aux32 >> 28;
+        const float d = bxi->d;
+#ifdef INT8_MMA_AVAILABLE
+        x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = (ls*d + d/2)/2;
+#else
+        x_df[i*(WARP_SIZE/4) + i/4   + kqsx] = (ls*d + d/2)/2;
+#endif // INT8_MMA_AVAILABLE
+    }
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq3_s(
+    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
+
+#ifdef INT8_MMA_AVAILABLE
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + WARP_SIZE*2);
+#else
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ3_S, mmq_y);
+    int   * x_qs = (int   *)  x_tile;
+    float * x_df = (float *) (x_qs + txs.qs);
+#endif // INT8_MMA_AVAILABLE
+
+    const int kqsx = threadIdx.x % (QI3_S/2);
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/(QI3_S/2)) {
+        int i = i0 + threadIdx.y*(2*WARP_SIZE/QI3_S) + threadIdx.x/(QI3_S/2);
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_iq3_s * bxi = (const block_iq3_s *) x + kbx0 + i*stride;
+
+        const int2      qs_packed = make_int2(get_int_b2(bxi->qs, 2*kqsx+0), get_int_b2(bxi->qs, 2*kqsx+1));
+        const uint8_t * qs        = (const uint8_t *) &qs_packed;
+
+        const int qh = bxi->qh[kqsx];
+
+        const int       signs_packed_32 = get_int_b2(bxi->signs, kqsx);
+        const uint8_t * signs_packed_8  = (const uint8_t *) &signs_packed_32;
+
+#pragma unroll
+        for (int l = 0; l < QR3_S; ++l) {
+            const int2 grid_pos = make_int2(
+                iq3s_grid[qs[2*l+0] | ((qh << (8 - 2*l)) & 0x100)],
+                iq3s_grid[qs[2*l+1] | ((qh << (7 - 2*l)) & 0x100)]);
+
+            const int signs0 = __vcmpne4(((signs_packed_8[l] & 0x03) << 7) | ((signs_packed_8[l] & 0x0C) << 21), 0x00000000);
+            const int signs1 = __vcmpne4(((signs_packed_8[l] & 0x30) << 3) | ((signs_packed_8[l] & 0xC0) << 17), 0x00000000);
+
+            const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0);
+            const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1);
+
+#ifdef INT8_MMA_AVAILABLE
+            x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l+0)] = grid_l;
+            x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l+1)] = grid_h;
+#else
+            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l+0)] = grid_l;
+            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l+1)] = grid_h;
+#endif // INT8_MMA_AVAILABLE
+        }
+
+        const int ls = 1 + 2*((bxi->scales[kqsx/2] >> (((2*kqsx) << 1) & 0x04)) & 0x0F);
+        const float d = bxi->d;
+#ifdef INT8_MMA_AVAILABLE
+        x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = ls*d;
+#else
+        x_df[i*(WARP_SIZE/4) + i/4   + kqsx] = ls*d;
+#endif // INT8_MMA_AVAILABLE
+    }
+}
+
+template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq1_s(
+    const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
+
+#ifdef INT8_MMA_AVAILABLE
+    int   * x_qs = (int   *)  x_tile;
+    half2 * x_ds = (half2 *) (x_qs + WARP_SIZE*2);
+#else
+    constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_IQ3_S, mmq_y);
+    int   * x_qs = (int   *)  x_tile;
+    half2 * x_ds = (half2 *) (x_qs + txs.qs);
+#endif // INT8_MMA_AVAILABLE
+
+    const int kqsx = threadIdx.x % QI1_S;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * WARP_SIZE/QI1_S) {
+        int i = i0 + threadIdx.y*(WARP_SIZE/QI1_S) + threadIdx.x/QI1_S;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_iq1_s * bxi = (const block_iq1_s *) x + kbx0 + i*stride;
+
+        const int       qs_packed = get_int_b2(bxi->qs, kqsx);
+        const uint8_t * qs        = (const uint8_t *) &qs_packed;
+
+        const int qh = bxi->qh[kqsx];
+
+    #pragma unroll
+        for (int l = 0; l < QR1_S/2; ++l) {
+            const int grid = iq1s_grid_gpu[qs[l] | (((qh >> (3*l)) & 0x07) << 8)];
+
+            const int grid0 = (grid >> 0) & 0x0F0F0F0F;
+            const int grid1 = (grid >> 4) & 0x0F0F0F0F;
+
+#ifdef INT8_MMA_AVAILABLE
+            x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 8*kqsx + (2*l+0)] = grid0;
+            x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 8*kqsx + (2*l+1)] = grid1;
+#else
+            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l+0)] = grid0;
+            x_qs[i*(2*WARP_SIZE + 1)     + 8*kqsx + (2*l+1)] = grid1;
+#endif // INT8_MMA_AVAILABLE
+        }
+
+        const float  d1q   = __half2float(bxi->d) * (((qh >> 11) & 0x0E) + 1);
+        const float  delta = -1.0f + IQ1S_DELTA - (qh & 0x8000) * (2.0f*IQ1S_DELTA/0x8000);
+
+#ifdef INT8_MMA_AVAILABLE
+        x_ds[i*MMQ_MMA_TILE_X_K_Q8_1 + kqsx] = make_half2(d1q, d1q*delta);
+#else
+        x_ds[i*(WARP_SIZE/4) + i/4   + kqsx] = make_half2(d1q, d1q*delta);
+#endif // INT8_MMA_AVAILABLE
+    }
+}
+
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_iq4_xs(
     const char * __restrict__ x, int * __restrict__ x_tile, const int & kbx0, const int & i_max, const int & stride) {
 
@@ -2320,7 +2351,7 @@ template <int mmq_x, int mmq_y, int nwarps, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_0> {
     static constexpr int              vdr          = VDR_Q4_0_Q8_1_MMQ;
     static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q4_0<mmq_y, nwarps, need_check>;
-    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q4_0_q8_1_mma<mmq_x, mmq_y, nwarps>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps, MMQ_Q8_1_DS_LAYOUT_DS4>;
     static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q4_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
 };
 
@@ -2328,7 +2359,7 @@ template <int mmq_x, int mmq_y, int nwarps, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_1> {
     static constexpr int              vdr          = VDR_Q4_1_Q8_1_MMQ;
     static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q4_1<mmq_y, nwarps, need_check>;
-    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q4_1_q8_1_mma<mmq_x, mmq_y, nwarps>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_1_q8_1_mma<mmq_x, mmq_y, nwarps>;
     static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q4_1_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
 };
 
@@ -2336,7 +2367,7 @@ template <int mmq_x, int mmq_y, int nwarps, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_0> {
     static constexpr int              vdr          = VDR_Q5_0_Q8_1_MMQ;
     static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q5_0<mmq_y, nwarps, need_check>;
-    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps, MMQ_Q8_1_DS_LAYOUT_D4>;
     static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
 };
 
@@ -2352,7 +2383,7 @@ template <int mmq_x, int mmq_y, int nwarps, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q8_0> {
     static constexpr int              vdr          = VDR_Q8_0_Q8_1_MMQ;
     static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q8_0<mmq_y, nwarps, need_check>;
-    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps, MMQ_Q8_1_DS_LAYOUT_D4>;
     static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
 };
 
@@ -2368,7 +2399,7 @@ template <int mmq_x, int mmq_y, int nwarps, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q3_K> {
     static constexpr int              vdr          = VDR_Q3_K_Q8_1_MMQ;
     static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q3_K<mmq_y, nwarps, need_check>;
-    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q3_K_q8_1_mma<mmq_x, mmq_y, nwarps>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y, nwarps>;
     static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q3_K_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
 };
 
@@ -2376,7 +2407,7 @@ template <int mmq_x, int mmq_y, int nwarps, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_K> {
     static constexpr int              vdr          = VDR_Q4_K_Q8_1_MMQ;
     static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q4_K<mmq_y, nwarps, need_check>;
-    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q4_K_q8_1_mma<mmq_x, mmq_y, nwarps>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_1_q8_1_mma<mmq_x, mmq_y, nwarps>;
     static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q4_K_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
 };
 
@@ -2384,7 +2415,7 @@ template <int mmq_x, int mmq_y, int nwarps, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_K> {
     static constexpr int              vdr          = VDR_Q5_K_Q8_1_MMQ;
     static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q5_K<mmq_y, nwarps, need_check>;
-    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q5_K_q8_1_mma<mmq_x, mmq_y, nwarps>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_1_q8_1_mma<mmq_x, mmq_y, nwarps>;
     static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q5_K_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
 };
 
@@ -2396,11 +2427,59 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q6_K> {
     static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q6_K_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
 };
 
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ2_XXS> {
+    static constexpr int              vdr          = VDR_IQ2_XXS_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq2_xxs<mmq_y, nwarps, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps, MMQ_Q8_1_DS_LAYOUT_D4>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ2_XS> {
+    static constexpr int              vdr          = VDR_IQ2_XS_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq2_xs<mmq_y, nwarps, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y, nwarps>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ2_S> {
+    static constexpr int              vdr          = VDR_IQ2_S_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq2_s<mmq_y, nwarps, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y, nwarps>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ3_XXS> {
+    static constexpr int              vdr          = VDR_IQ3_XXS_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq3_xxs<mmq_y, nwarps, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps, MMQ_Q8_1_DS_LAYOUT_D4>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ3_S> {
+    static constexpr int              vdr          = VDR_IQ3_S_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq3_s<mmq_y, nwarps, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps, MMQ_Q8_1_DS_LAYOUT_D4>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
+};
+
+template <int mmq_x, int mmq_y, int nwarps, bool need_check>
+struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ1_S> {
+    static constexpr int              vdr          = VDR_IQ1_S_Q8_1_MMQ;
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq1_s<mmq_y, nwarps, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_1_q8_1_mma<mmq_x, mmq_y, nwarps>;
+    static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_1_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
+};
+
 template <int mmq_x, int mmq_y, int nwarps, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ4_NL> {
     static constexpr int              vdr          = VDR_IQ4_NL_Q8_1_MMQ;
     static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq4_nl<mmq_y, nwarps, need_check>;
-    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps, MMQ_Q8_1_DS_LAYOUT_D4>;
     static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
 };
 
@@ -2408,7 +2487,7 @@ template <int mmq_x, int mmq_y, int nwarps, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_IQ4_XS> {
     static constexpr int              vdr          = VDR_IQ4_XS_Q8_1_MMQ;
     static constexpr load_tiles_mmq_t load_tiles   = load_tiles_iq4_xs<mmq_y, nwarps, need_check>;
-    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps, MMQ_Q8_1_DS_LAYOUT_D4>;
     static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
 };
 
@@ -2837,6 +2916,12 @@ extern DECL_MMQ_CASE(GGML_TYPE_Q3_K);
 extern DECL_MMQ_CASE(GGML_TYPE_Q4_K);
 extern DECL_MMQ_CASE(GGML_TYPE_Q5_K);
 extern DECL_MMQ_CASE(GGML_TYPE_Q6_K);
+extern DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS);
+extern DECL_MMQ_CASE(GGML_TYPE_IQ2_XS);
+extern DECL_MMQ_CASE(GGML_TYPE_IQ2_S);
+extern DECL_MMQ_CASE(GGML_TYPE_IQ3_XXS);
+extern DECL_MMQ_CASE(GGML_TYPE_IQ3_S);
+extern DECL_MMQ_CASE(GGML_TYPE_IQ1_S);
 extern DECL_MMQ_CASE(GGML_TYPE_IQ4_NL);
 extern DECL_MMQ_CASE(GGML_TYPE_IQ4_XS);
 
diff --git a/ggml/src/ggml-cuda/template-instances/generate_cu_files.py b/ggml/src/ggml-cuda/template-instances/generate_cu_files.py
index ffeb3c27d..d7874e6ea 100755
--- a/ggml/src/ggml-cuda/template-instances/generate_cu_files.py
+++ b/ggml/src/ggml-cuda/template-instances/generate_cu_files.py
@@ -23,7 +23,8 @@ SOURCE_FATTN_WMMA_CASE = "DECL_FATTN_WMMA_F16_CASE({head_size}, {cols_per_block}
 TYPES_MMQ = [
     "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
     "GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
-    "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
+    "GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
+    "GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
 ]
 
 SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
diff --git a/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu
new file mode 100644
index 000000000..84ec85029
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ1_S);
diff --git a/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu
new file mode 100644
index 000000000..583c4e5a5
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ2_S);
diff --git a/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu
new file mode 100644
index 000000000..edaf1560d
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ2_XS);
diff --git a/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu
new file mode 100644
index 000000000..233d9342c
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS);
diff --git a/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu
new file mode 100644
index 000000000..6092dc713
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ3_S);
diff --git a/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu
new file mode 100644
index 000000000..1d5bd201f
--- /dev/null
+++ b/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ3_XXS);
diff --git a/ggml/src/ggml-cuda/vecdotq.cuh b/ggml/src/ggml-cuda/vecdotq.cuh
index 6a17d0f3e..40091a0ef 100644
--- a/ggml/src/ggml-cuda/vecdotq.cuh
+++ b/ggml/src/ggml-cuda/vecdotq.cuh
@@ -188,6 +188,27 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 }
 
+template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_16_q8_1_impl(
+    const int * v, const int * u, const float * d8_0, const float & d8_1) {
+
+    float sumf = 0.0f;
+
+#pragma unroll
+    for (int i0 = 0; i0 < vdr; i0 += QI8_0/2) {
+        int sumi = 0;
+
+#pragma unroll
+        for (int i = i0; i < i0 + QI8_0/2; ++i) {
+            // SIMD dot product of quantized values
+            sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
+        }
+
+        sumf += d8_0[i0/(QI8_0/2)]*sumi;
+    }
+
+    return d8_1*sumf;
+}
+
 #define VDR_Q2_K_Q8_1_MMVQ 1
 #define VDR_Q2_K_Q8_1_MMQ  4
 
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 1defed3ca..47418597c 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -4748,7 +4748,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 
         int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
 
-        sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
+        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
     }
 
 #elif defined(__POWER9_VECTOR__)
diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index 68d41411b..397bd98dd 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -267,7 +267,7 @@ struct ggml_backend_sycl_context {
 
     queue_ptr stream(int device, int stream) {
         if (qptrs[device][stream] == nullptr) {
-            qptrs[device][stream] = &(dpct::get_current_device().default_queue());
+            qptrs[device][stream] = &(dpct::get_device(device).default_queue());
         }
         return qptrs[device][stream];
     }
diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp
index 31df1cb9e..4aaa76bfb 100644
--- a/ggml/src/ggml-sycl/dpct/helper.hpp
+++ b/ggml/src/ggml-sycl/dpct/helper.hpp
@@ -588,7 +588,7 @@ namespace dpct
         out = prop;
     }
 
-   /// dpct device extension
+    /// dpct device extension
     class device_ext : public sycl::device {
       typedef std::mutex mutex_type;
 
@@ -697,7 +697,7 @@ namespace dpct
         std::unique_lock<mutex_type> lock(m_mutex);
         lock.unlock();
         for (auto &q : _queues) {
-          q.wait_and_throw();
+            q.wait_and_throw();
         }
         // Guard the destruct of current_queues to make sure the ref count is
         // safe.
@@ -734,7 +734,12 @@ namespace dpct
 
       void destroy_queue(sycl::queue queue) {
         std::lock_guard<mutex_type> lock(m_mutex);
-        _queues.clear();
+        _queues.erase(std::remove_if(_queues.begin(), _queues.end(),
+                                    [=](const sycl::queue &q) -> bool
+                                    {
+                                        return q == queue;
+                                    }),
+                    _queues.end());
       }
       void set_saved_queue(sycl::queue q) {
         std::lock_guard<mutex_type> lock(m_mutex);
@@ -764,13 +769,13 @@ namespace dpct
         if (enable_exception_handler) {
           eh = exception_handler;
         }
-        auto q = sycl::queue(*this, eh,
-                             sycl::property_list(
+        _queues.push_back(sycl::queue(
+            *this, eh,
+            sycl::property_list(
 #ifdef DPCT_PROFILING_ENABLED
-                                 sycl::property::queue::enable_profiling(),
+                sycl::property::queue::enable_profiling(),
 #endif
-                                 properties...));
-        _queues.push_back(q);
+                properties...)));
 
         return _queues.back();
       }
@@ -783,8 +788,8 @@ namespace dpct
         if (enable_exception_handler) {
           eh = exception_handler;
         }
-        _queues.push_back(
-            sycl::queue(device, eh,
+        _queues.push_back(sycl::queue(
+            device, eh,
                         sycl::property_list(
 #ifdef DPCT_PROFILING_ENABLED
                             sycl::property::queue::enable_profiling(),
@@ -855,15 +860,75 @@ namespace dpct
         unsigned int get_device_id(const sycl::device &dev)
         {
             unsigned int id = 0;
-            for (auto dev_item : _devs)
+            for (auto &dev_item : _devs)
             {
                 if (*dev_item == dev)
                 {
-                    break;
+                    return id;
                 }
                 id++;
             }
-            return id;
+            return -1;
+        }
+
+        inline std::string get_preferred_gpu_platform_name() {
+            std::string result;
+
+            std::string filter = "level-zero";
+            char* env = getenv("ONEAPI_DEVICE_SELECTOR");
+            if (env) {
+                if (std::strstr(env, "level_zero")) {
+                    filter = "level-zero";
+                }
+                else if (std::strstr(env, "opencl")) {
+                    filter = "opencl";
+                }
+                else if (std::strstr(env, "cuda")) {
+                    filter = "cuda";
+                }
+                else if (std::strstr(env, "hip")) {
+                    filter = "hip";
+                }
+                else {
+                    throw std::runtime_error("invalid device filter: " + std::string(env));
+                }
+            }
+
+            auto plaform_list = sycl::platform::get_platforms();
+
+            for (const auto& platform : plaform_list) {
+                auto devices = platform.get_devices();
+                auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) {
+                    return d.is_gpu();
+                });
+
+                if (gpu_dev == devices.end()) {
+                    // cout << "platform [" << platform_name
+                    //      << "] does not contain GPU devices, skipping\n";
+                    continue;
+                }
+
+                auto platform_name = platform.get_info<sycl::info::platform::name>();
+                std::string platform_name_low_case;
+                platform_name_low_case.resize(platform_name.size());
+
+                std::transform(
+                    platform_name.begin(), platform_name.end(), platform_name_low_case.begin(), ::tolower);
+
+                if (platform_name_low_case.find(filter) == std::string::npos) {
+                    // cout << "platform [" << platform_name
+                    //      << "] does not match with requested "
+                    //      << filter << ", skipping\n";
+                    continue;
+                }
+
+                result = platform_name;
+            }
+
+            if (result.empty())
+                throw std::runtime_error("can not find preferred GPU platform");
+
+            return result;
         }
 
         template <class DeviceSelector>
@@ -930,10 +995,15 @@ namespace dpct
             // Keep track of the number of devices per backend
             std::map<sycl::backend, size_t> DeviceNums;
             std::map<std::string, std::vector<sycl::device>> backend_devices;
+            auto preferred_platform_name = get_preferred_gpu_platform_name();
 
             while (!Platforms.empty()) {
                 auto Platform = Platforms.back();
                 Platforms.pop_back();
+                auto platform_name = Platform.get_info<sycl::info::platform::name>();
+                if (platform_name.compare(preferred_platform_name) != 0) {
+                    continue;
+                }
                 auto devices = Platform.get_devices();
                 std::string backend_type = get_device_backend_and_type(devices[0]);
                 for (const auto &device : devices) {
@@ -1989,6 +2059,11 @@ namespace dpct
         return dev_mgr::instance().current_device();
     }
 
+    static inline device_ext &get_device(unsigned int id)
+    {
+        return dev_mgr::instance().get_device(id);
+    }
+
     static inline sycl::queue &get_in_order_queue()
     {
         return dev_mgr::instance().current_device().in_order_queue();
diff --git a/ggml/src/ggml-sycl/softmax.cpp b/ggml/src/ggml-sycl/softmax.cpp
index c5d9a837e..17a542e49 100644
--- a/ggml/src/ggml-sycl/softmax.cpp
+++ b/ggml/src/ggml-sycl/softmax.cpp
@@ -152,7 +152,8 @@ static void soft_max_f32_sycl(const float * x, const float * mask,
 
     const sycl::range<3> block_dims(1, 1, nth);
     const sycl::range<3> block_nums(1, 1, nrows_x);
-    const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE);
+    const size_t n_val_tmp = nth / WARP_SIZE;
+    const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + n_val_tmp);
 
     const uint32_t n_head_kv   = nrows_x/nrows_y;
     const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
index 8efe32329..6bcd81a7b 100644
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -38,8 +38,6 @@
 #define VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI 1
 #define VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE 2
 
-#define VK_NUM_TYPES 16
-
 #define GGML_VK_MAX_NODES 8192
 
 #define MAX_VK_BUFFERS 256
@@ -162,23 +160,23 @@ struct vk_device_struct {
     vk_matmul_pipeline pipeline_matmul_f16_f32;
     vk_pipeline pipeline_matmul_split_k_reduce;
 
-    vk_matmul_pipeline pipeline_dequant_mul_mat_mat[VK_NUM_TYPES];
+    vk_matmul_pipeline pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT];
 
     vk_matmul_pipeline pipeline_matmul_id_f32;
     vk_matmul_pipeline pipeline_matmul_id_f16;
     vk_matmul_pipeline pipeline_matmul_id_f16_f32;
 
-    vk_matmul_pipeline pipeline_dequant_mul_mat_mat_id[VK_NUM_TYPES];
+    vk_matmul_pipeline pipeline_dequant_mul_mat_mat_id[GGML_TYPE_COUNT];
 
-    vk_pipeline pipeline_dequant[VK_NUM_TYPES];
-    vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[VK_NUM_TYPES];
-    vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[VK_NUM_TYPES];
-    vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[VK_NUM_TYPES];
+    vk_pipeline pipeline_dequant[GGML_TYPE_COUNT];
+    vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_COUNT];
+    vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_COUNT];
+    vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_COUNT];
 
     vk_pipeline pipeline_mul_mat_vec_p021_f16_f32;
     vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
-    vk_pipeline pipeline_get_rows[VK_NUM_TYPES];
-    vk_pipeline pipeline_get_rows_f32[VK_NUM_TYPES];
+    vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT];
+    vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT];
     vk_pipeline pipeline_mul_f32;
     vk_pipeline pipeline_div_f32;
     vk_pipeline pipeline_add_f32;
@@ -1059,25 +1057,6 @@ static void ggml_vk_wait_events(vk_context * ctx, std::vector<vk::Event>&& event
     );
 }
 
-static bool ggml_vk_build_shader(ggml_type type) {
-    switch(type) {
-    case GGML_TYPE_F16:
-    case GGML_TYPE_Q4_0:
-    case GGML_TYPE_Q4_1:
-    case GGML_TYPE_Q5_0:
-    case GGML_TYPE_Q5_1:
-    case GGML_TYPE_Q8_0:
-    case GGML_TYPE_Q2_K:
-    case GGML_TYPE_Q3_K:
-    case GGML_TYPE_Q4_K:
-    case GGML_TYPE_Q5_K:
-    case GGML_TYPE_Q6_K:
-        return true;
-    default:
-        return false;
-    }
-}
-
 static void ggml_vk_load_shaders(vk_device& device) {
     VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
 
@@ -1112,6 +1091,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
     device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
     device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();
 
     device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
     device->pipeline_matmul_id_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
@@ -1126,6 +1106,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
     device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
     device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();
 
     if (device->fp16) {
         ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
@@ -1226,6 +1207,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
 
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->l, "matmul_iq4_nl_f32_l", matmul_iq4_nl_f32_len, matmul_iq4_nl_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->m, "matmul_iq4_nl_f32_m", matmul_iq4_nl_f32_len, matmul_iq4_nl_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->s, "matmul_iq4_nl_f32_s", matmul_iq4_nl_f32_len, matmul_iq4_nl_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_l, "matmul_iq4_nl_f32_aligned_l", matmul_iq4_nl_f32_aligned_len, matmul_iq4_nl_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_m, "matmul_iq4_nl_f32_aligned_m", matmul_iq4_nl_f32_aligned_len, matmul_iq4_nl_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_s, "matmul_iq4_nl_f32_aligned_s", matmul_iq4_nl_f32_aligned_len, matmul_iq4_nl_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
+
         ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
         ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
@@ -1316,6 +1304,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->l, "matmul_id_iq4_nl_f32_l", matmul_id_iq4_nl_f32_len, matmul_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->m, "matmul_id_iq4_nl_f32_m", matmul_id_iq4_nl_f32_len, matmul_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->s, "matmul_id_iq4_nl_f32_s", matmul_id_iq4_nl_f32_len, matmul_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_l, "matmul_id_iq4_nl_f32_aligned_l", matmul_id_iq4_nl_f32_aligned_len, matmul_id_iq4_nl_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_m, "matmul_id_iq4_nl_f32_aligned_m", matmul_id_iq4_nl_f32_aligned_len, matmul_id_iq4_nl_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_s, "matmul_id_iq4_nl_f32_aligned_s", matmul_id_iq4_nl_f32_aligned_len, matmul_id_iq4_nl_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
     } else {
         ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1415,6 +1410,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
 
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->l, "matmul_iq4_nl_f32_l", matmul_iq4_nl_f32_fp32_len, matmul_iq4_nl_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->m, "matmul_iq4_nl_f32_m", matmul_iq4_nl_f32_fp32_len, matmul_iq4_nl_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->s, "matmul_iq4_nl_f32_s", matmul_iq4_nl_f32_fp32_len, matmul_iq4_nl_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_l, "matmul_iq4_nl_f32_aligned_l", matmul_iq4_nl_f32_aligned_fp32_len, matmul_iq4_nl_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_m, "matmul_iq4_nl_f32_aligned_m", matmul_iq4_nl_f32_aligned_fp32_len, matmul_iq4_nl_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL]->a_s, "matmul_iq4_nl_f32_aligned_s", matmul_iq4_nl_f32_aligned_fp32_len, matmul_iq4_nl_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
+
         ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1);
         ggml_vk_create_pipeline(device, device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1);
@@ -1505,6 +1507,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
         ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->l, "matmul_id_iq4_nl_f32_l", matmul_id_iq4_nl_f32_fp32_len, matmul_id_iq4_nl_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->m, "matmul_id_iq4_nl_f32_m", matmul_id_iq4_nl_f32_fp32_len, matmul_id_iq4_nl_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->s, "matmul_id_iq4_nl_f32_s", matmul_id_iq4_nl_f32_fp32_len, matmul_id_iq4_nl_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_l, "matmul_id_iq4_nl_f32_aligned_l", matmul_id_iq4_nl_f32_aligned_fp32_len, matmul_id_iq4_nl_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_m, "matmul_id_iq4_nl_f32_aligned_m", matmul_id_iq4_nl_f32_aligned_fp32_len, matmul_id_iq4_nl_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL]->a_s, "matmul_id_iq4_nl_f32_aligned_s", matmul_id_iq4_nl_f32_aligned_fp32_len, matmul_id_iq4_nl_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align);
     }
 
     // mul mat vec
@@ -1520,6 +1529,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32",  mul_mat_vec_f32_f16_f32_len,  mul_mat_vec_f32_f16_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32",  mul_mat_vec_f16_f16_f32_len,  mul_mat_vec_f16_f16_f32_data,  "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -1533,6 +1543,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32",  mul_mat_vec_id_f32_f32_len,  mul_mat_vec_id_f32_f32_data,  "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32",  mul_mat_vec_id_f16_f32_len,  mul_mat_vec_id_f16_f32_data,  "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
@@ -1546,6 +1557,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
     ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
 
     // dequant shaders
     ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16",   dequant_f32_len,  dequant_f32_data,  "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
@@ -1559,6 +1571,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_IQ4_NL], "dequant_iq4_nl", dequant_iq4_nl_len, dequant_iq4_nl_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
 
     // get_rows
     ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32",  get_rows_f32_len,  get_rows_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@@ -1568,6 +1581,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl", get_rows_iq4_nl_len, get_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32",  get_rows_f32_f32_len,  get_rows_f32_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32",  get_rows_f16_f32_len,  get_rows_f16_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@@ -1576,6 +1590,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);
 
@@ -2087,6 +2102,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ4_NL:
             break;
         default:
             return nullptr;
@@ -2123,6 +2139,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ4_NL:
             break;
         default:
             return nullptr;
@@ -2148,6 +2165,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ4_NL:
             break;
         default:
             return nullptr;
@@ -2181,6 +2199,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ4_NL:
             break;
         default:
             return nullptr;
@@ -2206,6 +2225,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ4_NL:
             break;
         default:
             return nullptr;
@@ -3431,7 +3451,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
 
     const uint64_t nei0 = ids->ne[0];
     const uint64_t nei1 = ids->ne[1];
-    GGML_ASSERT(nei0 * nei1 <= 2048);
+    GGML_ASSERT(nei0 * nei1 <= 3072);
 
     const uint32_t nbi1 = ids->nb[1];
     const uint32_t nbi2 = ids->nb[2];
@@ -3443,8 +3463,6 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
 
     const uint64_t n_as = ne02;
 
-    GGML_ASSERT(n_as <= 8);
-
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
     ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
     ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
@@ -4623,22 +4641,22 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
         }
     }
 
-    ggml_pipeline_allocate_descriptor_sets(ctx, p, num_it);
+    ggml_pipeline_allocate_descriptor_sets(ctx->device, p, num_it);
     if (split_k > 1) {
-        ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
+        ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
 
         if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
             // Resize buffer
             if (ctx->prealloc_split_k != nullptr) {
                 ggml_vk_destroy_buffer(ctx->prealloc_split_k);
             }
-            ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
+            ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
         }
     }
 
-    vk_buffer d_X = ggml_vk_create_buffer_check(ctx, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    vk_buffer d_Y = ggml_vk_create_buffer_check(ctx, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    vk_buffer d_D = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer d_D = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
 
     X_TYPE* x = (X_TYPE *) malloc(sizeof(X_TYPE) * x_ne);
     Y_TYPE* y = (Y_TYPE *) malloc(sizeof(Y_TYPE) * y_ne);
@@ -4665,12 +4683,12 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
         }
     }
 
-    ggml_vk_buffer_write(ctx, d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
-    ggml_vk_buffer_write(ctx, d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);
+    ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
+    ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);
 
     vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
     for (size_t i = 0; i < num_it; i++) {
-        ggml_vk_ctx_begin(ctx, subctx);
+        ggml_vk_ctx_begin(ctx->device, subctx);
         ggml_vk_matmul(
             ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k),
             m, n, k,
@@ -4689,7 +4707,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     double time = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
 
     // copy dst to host
-    ggml_vk_buffer_read(ctx, d_D, 0, d, sizeof(float) * d_ne);
+    ggml_vk_buffer_read(d_D, 0, d, sizeof(float) * d_ne);
 
     float * d_chk = (float *) malloc(sizeof(float) * d_ne);
 
@@ -4765,7 +4783,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
 
         if (split_k > 1) {
             float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
-            ggml_vk_buffer_read(ctx, ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
+            ggml_vk_buffer_read(ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
 
             std::cerr << "d_buf0: " << std::endl << std::endl;
             ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
@@ -4785,8 +4803,8 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
 
     free(d_chk);
 
-    ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
-    ggml_vk_queue_cleanup(ctx, ctx->device->compute_queue);
+    ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue);
+    ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue);
 
     ggml_vk_destroy_buffer(d_X);
     ggml_vk_destroy_buffer(d_Y);
@@ -4834,90 +4852,23 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1
     }
 }
 
-static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool pinned) {
-    VK_LOG_DEBUG("ggml_vk_test_transfer(" << ne << ")");
-    // Check transfers are correct
-    vk_buffer buffer = ggml_vk_create_buffer_check(ctx, sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
-
-    float * x;
-    float * y;
-    if (pinned) {
-        x = (float *) ggml_vk_host_malloc(ctx, sizeof(float) * ne);
-        y = (float *) ggml_vk_host_malloc(ctx, sizeof(float) * ne);
-    } else {
-        x = (float *) malloc(sizeof(float) * ne);
-        y = (float *) malloc(sizeof(float) * ne);
-    }
-
-    for (size_t i = 0; i < ne; i++) {
-        x[i] = rand() / (float)RAND_MAX;
-    }
-
-    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
-    ggml_vk_ctx_begin(ctx, subctx);
-
-    auto begin = std::chrono::high_resolution_clock::now();
-
-    ggml_vk_buffer_write_async(ctx, subctx, buffer, 0, x, sizeof(float) * ne);
-
-    for (auto& cpy : subctx->in_memcpys) {
-        memcpy(cpy.dst, cpy.src, cpy.n);
-    }
-    subctx->in_memcpys.clear();
-
-    ggml_vk_ctx_end(subctx);
-    ggml_vk_submit(subctx, ctx->fence);
-    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences");
-    ctx->device->device.resetFences({ ctx->fence });
-
-    auto end = std::chrono::high_resolution_clock::now();
-
-    double ms_to_gpu = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
-
-    ggml_vk_ctx_begin(ctx, subctx);
-
-    begin = std::chrono::high_resolution_clock::now();
-
-    ggml_vk_buffer_read_async(ctx, subctx, buffer, 0, y, sizeof(float) * ne);
-
-    ggml_vk_ctx_end(subctx);
-    ggml_vk_submit(subctx, ctx->fence);
-    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences");
-    ctx->device->device.resetFences({ ctx->fence });
-
-    for (auto& cpy : subctx->out_memcpys) {
-        memcpy(cpy.dst, cpy.src, cpy.n);
-    }
-    subctx->out_memcpys.clear();
-
-    end = std::chrono::high_resolution_clock::now();
-
-    double ms_from_gpu = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
-
-    double avg_err = 0.0;
-    for (size_t i = 0; i < ne; i++) {
-        avg_err += std::fabs(x[i] - y[i]);
-    }
-
-    double kb = ne * sizeof(float) / 1024.0;
-
-    std::cerr << "TEST TRANSFER " << kb << " KB to_gpu " << ms_to_gpu << "ms (" << kb / ms_to_gpu * 1000.0 / 1024.0 << " MB/s) from_gpu " << ms_from_gpu << "ms (" << kb / ms_from_gpu * 1000.0 / 1024.0 << " MB/s) avg_err=" << avg_err / ne << std::endl;
-
-    ggml_vk_destroy_buffer(buffer);
-
-    if (pinned) {
-        ggml_vk_host_free(ctx, x);
-        ggml_vk_host_free(ctx, y);
-    } else {
-        free(x);
-        free(y);
-    }
-}
-
 static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) {
     ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr);
 }
 
+static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, ggml_type quant) {
+    if (quant == GGML_TYPE_F32) {
+        memcpy(to, from, sizeof(float) * ne);
+        return;
+    }
+
+    ggml_type_traits_t tt = ggml_internal_get_type_traits(quant);
+
+    ggml_to_float_t dequant_fn = tt.to_float;
+
+    dequant_fn(from, to, ne);
+}
+
 static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
     VK_LOG_DEBUG("ggml_vk_test_dequant(" << ne << ")");
     const size_t x_sz = sizeof(float) * ne;
@@ -4925,24 +4876,26 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
     const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant);
     float * x = (float *) malloc(x_sz);
     void * qx = malloc(qx_sz);
-    vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    vk_buffer x_buf = ggml_vk_create_buffer_check(ctx, x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer x_buf = ggml_vk_create_buffer_check(ctx->device, x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    float * x_ref = (float *) malloc(x_sz);
     ggml_fp16_t * x_chk = (ggml_fp16_t *) malloc(x_sz_f16);
 
     for (size_t i = 0; i < ne; i++) {
         x[i] = rand() / (float)RAND_MAX;
     }
 
-    vk_pipeline p = ctx->device->pipeline_dequant[quant];
+    vk_pipeline p = ggml_vk_get_to_fp16(ctx, quant);
 
     ggml_vk_quantize_data(x, qx, ne, quant);
+    ggml_vk_dequantize_data(qx, x_ref, ne, quant);
 
-    ggml_pipeline_allocate_descriptor_sets(ctx, p, 1);
+    ggml_pipeline_allocate_descriptor_sets(ctx->device, p, 1);
 
-    ggml_vk_buffer_write(ctx, qx_buf, 0, qx, qx_sz);
+    ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
 
     vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
-    ggml_vk_ctx_begin(ctx, subctx);
+    ggml_vk_ctx_begin(ctx->device, subctx);
     const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
     ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
     ggml_vk_ctx_end(subctx);
@@ -4956,13 +4909,13 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
     auto end = std::chrono::high_resolution_clock::now();
 
     double ms_dequant = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
-    ggml_vk_buffer_read(ctx, x_buf, 0, x_chk, x_sz_f16);
+    ggml_vk_buffer_read(x_buf, 0, x_chk, x_sz_f16);
 
     int first_err = -1;
 
     double avg_err = 0.0;
     for (size_t i = 0; i < ne; i++) {
-        double error = std::fabs(x[i] - ggml_fp16_to_fp32(x_chk[i]));
+        double error = std::fabs(x_ref[i] - ggml_fp16_to_fp32(x_chk[i]));
         avg_err += error;
 
         if (first_err < 0 && error > 0.05) {
@@ -4982,7 +4935,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
         }
         std::cerr << std::endl << "Expected result: " << std::endl << std::endl;
         for (int i = std::max(0, first_err - 5); i < std::min((int)ne, first_err + 5); i++) {
-            std::cerr << x[i] << ", ";
+            std::cerr << x_ref[i] << ", ";
         }
         std::cerr << std::endl;
     }
@@ -4992,6 +4945,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
 
     free(x);
     free(qx);
+    free(x_ref);
     free(x_chk);
 }
 
@@ -5040,9 +4994,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
     float * x = (float *) malloc(x_sz);
     float * y = (float *) malloc(y_sz);
     void * qx = malloc(qx_sz);
-    vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    vk_buffer y_buf = ggml_vk_create_buffer_check(ctx, y_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    vk_buffer d_buf = ggml_vk_create_buffer_check(ctx, d_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx->device, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer y_buf = ggml_vk_create_buffer_check(ctx->device, y_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    vk_buffer d_buf = ggml_vk_create_buffer_check(ctx->device, d_sz, vk::MemoryPropertyFlagBits::eDeviceLocal);
     float * d = (float *) malloc(d_sz);
     float * d_chk = (float *) malloc(d_sz);
 
@@ -5057,25 +5011,25 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
         y[i] = (i % k == i / k) ? 1.0f : 0.0f;
     }
 
-    ggml_pipeline_allocate_descriptor_sets(ctx, p, num_it);
+    ggml_pipeline_allocate_descriptor_sets(ctx->device, p, num_it);
     if (split_k > 1) {
-        ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
+        ggml_pipeline_allocate_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
 
         if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
             // Resize buffer
             if (ctx->prealloc_split_k != nullptr) {
                 ggml_vk_destroy_buffer(ctx->prealloc_split_k);
             }
-            ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
+            ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx->device, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
         }
     }
 
-    ggml_vk_buffer_write(ctx, qx_buf, 0, qx, qx_sz);
-    ggml_vk_buffer_write(ctx, y_buf, 0, y, y_sz);
+    ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
+    ggml_vk_buffer_write(y_buf, 0, y, y_sz);
 
     vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
     for (size_t i = 0; i < num_it; i++) {
-        ggml_vk_ctx_begin(ctx, subctx);
+        ggml_vk_ctx_begin(ctx->device, subctx);
         ggml_vk_matmul(
             ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), ggml_vk_subbuffer(ctx->prealloc_split_k),
             m, n, k,
@@ -5094,7 +5048,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
     auto end = std::chrono::high_resolution_clock::now();
 
     double time_ms = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
-    ggml_vk_buffer_read(ctx, d_buf, 0, d, d_sz);
+    ggml_vk_buffer_read(d_buf, 0, d, d_sz);
 
     ggml_init_params iparams = {
         /*.mem_size   =*/ 1024*1024*1024,
@@ -5149,7 +5103,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
 
         if (split_k > 1) {
             float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k);
-            ggml_vk_buffer_read(ctx, ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
+            ggml_vk_buffer_read(ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k);
 
             std::cerr << "d_buf0: " << std::endl << std::endl;
             ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
@@ -5302,12 +5256,9 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
 
 static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 #if defined(GGML_VULKAN_RUN_TESTS)
-    ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
+    ctx->staging = ggml_vk_create_buffer_check(ctx->device, 100ul * 1024ul * 1024ul,
         vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
         vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
-    ggml_vk_test_transfer(ctx, 8192 * 1000, false);
-    ggml_vk_test_transfer(ctx, 8192 * 1000, true);
-
     ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32);
     ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_0);
     ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_1);
@@ -5319,85 +5270,90 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_K);
     ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_K);
     ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q6_K);
+    ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_IQ4_NL);
 
     ggml_vk_test_matmul<ggml_fp16_t, ggml_fp16_t>(ctx, 512, 512, 100, 32, 100, 1, 2);
 
     ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 0);
     ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 1);
     ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 1, 2);
-    ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 0);
-    ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 1);
-    ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 2);
+    // ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 0);
+    // ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 1);
+    // ggml_vk_test_matmul<float, float>(ctx, 128, 512, 512, 2, 100, 4, 2);
 
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_0);
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_0);
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_0);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_0);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_0);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_0);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_0);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_0);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_0);
 
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_1);
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_1);
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_1);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_1);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_1);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_1);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_1);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_1);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_1);
 
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_0);
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_0);
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_0);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_0);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_0);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_0);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_0);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_0);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_0);
 
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_1);
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_1);
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_1);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_1);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_1);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_1);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_1);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_1);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_1);
 
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q8_0);
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q8_0);
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q8_0);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q8_0);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q8_0);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0);
 
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K);
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K);
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K);
 
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K);
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K);
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K);
 
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K);
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K);
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K);
 
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K);
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K);
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K);
 
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K);
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K);
     ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K);
-    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K);
+    // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K);
+
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_IQ4_NL);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_IQ4_NL);
+    ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_IQ4_NL);
 
     std::cerr << std::endl;
 
@@ -5429,9 +5385,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
         ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0);
         ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1);
         ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2);
-        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0);
-        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1);
-        ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2);
+        // ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0);
+        // ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1);
+        // ggml_vk_test_matmul<ggml_fp16_t, float>(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2);
         std::cerr << std::endl;
     }
 
@@ -6263,6 +6219,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
                     case GGML_TYPE_Q4_K:
                     case GGML_TYPE_Q5_K:
                     case GGML_TYPE_Q6_K:
+                    case GGML_TYPE_IQ4_NL:
                         break;
                     default:
                         return false;
@@ -6291,6 +6248,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
                     case GGML_TYPE_Q5_0:
                     case GGML_TYPE_Q5_1:
                     case GGML_TYPE_Q8_0:
+                    case GGML_TYPE_IQ4_NL:
                         return true;
                     default:
                         return false;
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 7a39c685b..29afcc7f8 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -21015,7 +21015,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
             gguf_tensor_info_sanitize(info);
 
             // make sure there is no duplicated tensor names
-            for (uint64_t j = 0; j < i; ++j) {
+            for (uint64_t j = 0; j < i && ok; ++j) {
                 if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) {
                     fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data);
                     ok = false;
@@ -21096,6 +21096,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         };
 
         *params.ctx = ggml_init(pdata);
+        if (*params.ctx == NULL) {
+            fprintf(stderr, "%s: failed to initialize context\n", __func__);
+            fclose(file);
+            gguf_free(ctx);
+            return NULL;
+        }
 
         struct ggml_context * ctx_data = *params.ctx;
 
@@ -22005,6 +22011,14 @@ int ggml_cpu_has_cann(void) {
 #endif
 }
 
+int ggml_cpu_has_llamafile(void) {
+#if defined(GGML_USE_LLAMAFILE)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_gpublas(void) {
     return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl();
 }
diff --git a/ggml/src/vulkan-shaders/dequant_funcs.comp b/ggml/src/vulkan-shaders/dequant_funcs.comp
index 35d424d18..d5b989735 100644
--- a/ggml/src/vulkan-shaders/dequant_funcs.comp
+++ b/ggml/src/vulkan-shaders/dequant_funcs.comp
@@ -58,3 +58,11 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
     return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d;
 }
 #endif
+
+#if defined(DATA_A_IQ4_NL)
+vec2 dequantize(uint ib, uint iqs, uint a_offset) {
+    const float d = float(data_a[a_offset + ib].d);
+    const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
+    return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
+}
+#endif
diff --git a/ggml/src/vulkan-shaders/dequant_iq4_nl.comp b/ggml/src/vulkan-shaders/dequant_iq4_nl.comp
new file mode 100644
index 000000000..34ef3da30
--- /dev/null
+++ b/ggml/src/vulkan-shaders/dequant_iq4_nl.comp
@@ -0,0 +1,30 @@
+#version 450
+
+#include "dequant_head.comp"
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {block_iq4_nl data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
+
+void main() {
+    const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
+
+    const uint tid = gl_LocalInvocationID.x % 64;
+    const uint il  = tid/32;
+    const uint ir  = tid%32;
+    const uint ib = 32*i + ir;
+    if (ib >= p.nel / 32) {
+        return;
+    }
+
+    const uint q_idx = 8*il;
+    const uint b_idx = 1024*i + 32*ir + q_idx;
+
+    const float d = float(data_a[ib].d);
+
+    [[unroll]] for (uint l = 0; l < 8; ++l) {
+        data_b[b_idx + l +  0] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] & 0xF]);
+        data_b[b_idx + l + 16] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] >>  4]);
+    }
+}
diff --git a/ggml/src/vulkan-shaders/dequant_q4_0.comp b/ggml/src/vulkan-shaders/dequant_q4_0.comp
index 11e07e66b..408185327 100644
--- a/ggml/src/vulkan-shaders/dequant_q4_0.comp
+++ b/ggml/src/vulkan-shaders/dequant_q4_0.comp
@@ -18,15 +18,13 @@ void main() {
         return;
     }
 
-    const uint b_idx = 1024*i + 32*ir + 8*il;
+    const uint q_idx = 8*il;
+    const uint b_idx = 1024*i + 32*ir + q_idx;
 
     const float d = float(data_a[ib].d);
-    const float dm = -8.0f * d;
-
-    const uint q_idx = 8*il;
 
     [[unroll]] for (uint l = 0; l < 8; ++l) {
-        data_b[b_idx + l +  0] = D_TYPE(d * (data_a[ib].qs[q_idx + l] & 0xF) + dm);
-        data_b[b_idx + l + 16] = D_TYPE(d * (data_a[ib].qs[q_idx + l] >>  4) + dm);
+        data_b[b_idx + l +  0] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] & 0xF) - 8.0f));
+        data_b[b_idx + l + 16] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] >>  4) - 8.0f));
     }
 }
diff --git a/ggml/src/vulkan-shaders/mul_mm.comp b/ggml/src/vulkan-shaders/mul_mm.comp
index 7c2b45cce..5fe9d5241 100644
--- a/ggml/src/vulkan-shaders/mul_mm.comp
+++ b/ggml/src/vulkan-shaders/mul_mm.comp
@@ -71,7 +71,7 @@ shared FLOAT_TYPE buf_a[BM * (BK+1)];
 shared FLOAT_TYPE buf_b[BN * (BK+1)];
 
 #ifdef MUL_MAT_ID
-shared u16vec2 row_ids[2048];
+shared u16vec2 row_ids[3072];
 #endif
 
 void main() {
@@ -380,6 +380,19 @@ void main() {
 
             buf_a[buf_idx    ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi    ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi    ] >> qhshift) & 3) << 4)) - 32));
             buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
+#elif defined(DATA_A_IQ4_NL)
+            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
+            const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a;
+
+            const uint ib = idx / 16;
+            const uint iqs = idx & 0xF;
+
+            const float d = float(data_a[ib].d);
+            const uint vui = uint(data_a[ib].qs[iqs]);
+            const vec2 v = vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
+
+            buf_a[buf_idx     ] = FLOAT_TYPE(v.x);
+            buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
 #endif
         }
         [[unroll]] for (uint l = 0; l < BN; l += loadstride_b) {
diff --git a/ggml/src/vulkan-shaders/types.comp b/ggml/src/vulkan-shaders/types.comp
index 815fcbecd..d24c172ca 100644
--- a/ggml/src/vulkan-shaders/types.comp
+++ b/ggml/src/vulkan-shaders/types.comp
@@ -177,3 +177,24 @@ struct block_q6_K
 
 #define A_TYPE block_q6_K
 #endif
+
+// IQuants
+
+#if defined(DATA_A_IQ4_NL)
+#extension GL_EXT_shader_16bit_storage : require
+#define QUANT_K 32
+#define QUANT_R 2
+
+struct block_iq4_nl
+{
+    float16_t d;
+    uint8_t qs[QUANT_K/2];
+};
+
+#define A_TYPE block_iq4_nl
+
+const int8_t kvalues_iq4nl[16] = {
+    int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10),
+    int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113)
+};
+#endif
diff --git a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
index 3038d647f..c5be3754b 100644
--- a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -52,7 +52,8 @@ const std::vector<std::string> type_names = {
     "q3_k",
     "q4_k",
     "q5_k",
-    "q6_k"
+    "q6_k",
+    "iq4_nl"
 };
 
 void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str) {
diff --git a/gguf-py/gguf/metadata.py b/gguf-py/gguf/metadata.py
index bac6ebfb3..15189f717 100644
--- a/gguf-py/gguf/metadata.py
+++ b/gguf-py/gguf/metadata.py
@@ -54,6 +54,7 @@ class Metadata:
 
         model_card = Metadata.load_model_card(model_path)
         hf_params = Metadata.load_hf_parameters(model_path)
+        # TODO: load adapter_config.json when possible, it usually contains the base model of the LoRA adapter
 
         # heuristics
         metadata = Metadata.apply_metadata_heuristic(metadata, model_card, hf_params, model_path, total_params)
@@ -177,6 +178,12 @@ class Metadata:
             org_component = None
 
         name_parts: list[str] = model_full_name_component.split('-')
+
+        # Remove empty parts
+        for i in reversed(range(len(name_parts))):
+            if len(name_parts[i]) == 0:
+                del name_parts[i]
+
         name_types: list[
             set[Literal["basename", "size_label", "finetune", "version", "type"]]
         ] = [set() for _ in name_parts]
@@ -223,9 +230,19 @@ class Metadata:
                 name_parts[i] = part
             # Some easy to recognize finetune names
             elif i > 0 and re.fullmatch(r'chat|instruct|vision|lora', part, re.IGNORECASE):
-                name_types[i].add("finetune")
-                if part.lower() == "lora":
-                    name_parts[i] = "LoRA"
+                if total_params < 0 and part.lower() == "lora":
+                    # ignore redundant "lora" in the finetune part when the output is a lora adapter
+                    name_types[i].add("type")
+                else:
+                    name_types[i].add("finetune")
+
+        # Ignore word-based size labels when there is at least a number-based one present
+        # TODO: should word-based size labels always be removed instead?
+        if any(c.isdecimal() for n, t in zip(name_parts, name_types) if "size_label" in t for c in n):
+            for n, t in zip(name_parts, name_types):
+                if "size_label" in t:
+                    if all(c.isalpha() for c in n):
+                        t.remove("size_label")
 
         at_start = True
         # Find the basename through the annotated name
@@ -240,18 +257,18 @@ class Metadata:
 
         # Remove the basename annotation from trailing version
         for part, t in zip(reversed(name_parts), reversed(name_types)):
-            if "basename" in t:
-                if len(t) > 1:
-                    t.remove("basename")
+            if "basename" in t and len(t) > 1:
+                t.remove("basename")
             else:
                 break
 
         basename = "-".join(n for n, t in zip(name_parts, name_types) if "basename" in t) or None
-        size_label = "-".join(s for s, t in zip(name_parts, name_types) if "size_label" in t) or None
+        # Deduplicate size labels using order-preserving 'dict' ('set' seems to sort the keys)
+        size_label = "-".join(dict.fromkeys(s for s, t in zip(name_parts, name_types) if "size_label" in t).keys()) or None
         finetune = "-".join(f for f, t in zip(name_parts, name_types) if "finetune" in t) or None
         # TODO: should the basename version always be excluded?
-        # TODO: should multiple versions be joined together?
-        version = ([v for v, t, in zip(name_parts, name_types) if "version" in t and "basename" not in t] or [None])[-1]
+        # NOTE: multiple finetune versions are joined together
+        version = "-".join(v for v, t, in zip(name_parts, name_types) if "version" in t and "basename" not in t) or None
 
         if size_label is None and finetune is None and version is None:
             # Too ambiguous, output nothing
diff --git a/gguf-py/gguf/utility.py b/gguf-py/gguf/utility.py
index ef76831b5..40d59b75e 100644
--- a/gguf-py/gguf/utility.py
+++ b/gguf-py/gguf/utility.py
@@ -50,15 +50,15 @@ def naming_convention(model_name: str | None, base_name: str | None, finetune_st
     # Reference: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention
 
     if base_name is not None:
-        name = base_name.strip().title().replace(' ', '-').replace('/', '-')
+        name = base_name.strip().replace(' ', '-').replace('/', '-')
     elif model_name is not None:
-        name = model_name.strip().title().replace(' ', '-').replace('/', '-')
+        name = model_name.strip().replace(' ', '-').replace('/', '-')
     else:
         name = "ggml-model"
 
     parameters = f"-{size_label}" if size_label is not None else ""
 
-    finetune = f"-{finetune_string.strip().title().replace(' ', '-')}" if finetune_string is not None else ""
+    finetune = f"-{finetune_string.strip().replace(' ', '-')}" if finetune_string is not None else ""
 
     version = f"-{version_string.strip().replace(' ', '-')}" if version_string is not None else ""
 
diff --git a/gguf-py/scripts/gguf_dump.py b/gguf-py/scripts/gguf_dump.py
index a73ca2776..1b6546541 100755
--- a/gguf-py/scripts/gguf_dump.py
+++ b/gguf-py/scripts/gguf_dump.py
@@ -4,6 +4,7 @@ from __future__ import annotations
 import logging
 import argparse
 import os
+import re
 import sys
 from pathlib import Path
 from typing import Any
@@ -244,26 +245,58 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None
         else:
             pretty_type = str(field.types[-1].name)
 
+        def escape_markdown_inline_code(value_string):
+            # Find the longest contiguous sequence of backticks in the string then
+            # wrap string with appropriate number of backticks required to escape it
+            max_backticks = max((len(match.group(0)) for match in re.finditer(r'`+', value_string)), default=0)
+            inline_code_marker = '`' * (max_backticks + 1)
+
+            # If the string starts or ends with a backtick, add a space at the beginning and end
+            if value_string.startswith('`') or value_string.endswith('`'):
+                value_string = f" {value_string} "
+
+            return f"{inline_code_marker}{value_string}{inline_code_marker}"
+
         total_elements = len(field.data)
         value = ""
         if len(field.types) == 1:
             curr_type = field.types[0]
             if curr_type == GGUFValueType.STRING:
-                value = repr(str(bytes(field.parts[-1]), encoding='utf-8')[:60])
+                truncate_length = 60
+                value_string = str(bytes(field.parts[-1]), encoding='utf-8')
+                if len(value_string) > truncate_length:
+                    head = escape_markdown_inline_code(value_string[:truncate_length // 2])
+                    tail = escape_markdown_inline_code(value_string[-truncate_length // 2:])
+                    value = "{head}...{tail}".format(head=head, tail=tail)
+                else:
+                    value = escape_markdown_inline_code(value_string)
             elif curr_type in reader.gguf_scalar_to_np:
                 value = str(field.parts[-1][0])
         else:
             if field.types[0] == GGUFValueType.ARRAY:
                 curr_type = field.types[1]
+                array_elements = []
+
                 if curr_type == GGUFValueType.STRING:
                     render_element = min(5, total_elements)
                     for element_pos in range(render_element):
-                        value += repr(str(bytes(field.parts[-1 - element_pos]), encoding='utf-8')[:5]) + (", " if total_elements > 1 else "")
+                        truncate_length = 30
+                        value_string = str(bytes(field.parts[-1 - (total_elements - element_pos - 1) * 2]), encoding='utf-8')
+                        if len(value_string) > truncate_length:
+                            head = escape_markdown_inline_code(value_string[:truncate_length // 2])
+                            tail = escape_markdown_inline_code(value_string[-truncate_length // 2:])
+                            value = "{head}...{tail}".format(head=head, tail=tail)
+                        else:
+                            value = escape_markdown_inline_code(value_string)
+                        array_elements.append(value)
+
                 elif curr_type in reader.gguf_scalar_to_np:
                     render_element = min(7, total_elements)
                     for element_pos in range(render_element):
-                        value += str(field.parts[-1 - element_pos][0]) + (", " if total_elements > 1 else "")
-                value = f'[ {value}{" ..." if total_elements > 1 else ""} ]'
+                        array_elements.append(str(field.parts[-1 - (total_elements - element_pos - 1)][0]))
+
+                value = f'[ {", ".join(array_elements).strip()}{", ..." if total_elements > len(array_elements) else ""} ]'
+
         kv_dump_table.append({"n":n, "pretty_type":pretty_type, "total_elements":total_elements, "field_name":field.name, "value":value})
 
     kv_dump_table_header_map = [
@@ -382,7 +415,7 @@ def dump_markdown_metadata(reader: GGUFReader, args: argparse.Namespace) -> None
             markdown_content += f"- Percentage of total elements: {group_percentage:.2f}%\n"
             markdown_content += "\n\n"
 
-        print(markdown_content)  # noqa: NP100
+    print(markdown_content)  # noqa: NP100
 
 
 def main() -> None:
diff --git a/gguf-py/tests/test_metadata.py b/gguf-py/tests/test_metadata.py
index 3fac82188..81a2a30ae 100755
--- a/gguf-py/tests/test_metadata.py
+++ b/gguf-py/tests/test_metadata.py
@@ -54,7 +54,7 @@ class TestMetadataMethod(unittest.TestCase):
         self.assertEqual(gguf.Metadata.get_model_id_components("NousResearch/Meta-Llama-3-8B"),
                          ('Meta-Llama-3-8B', "NousResearch", 'Meta-Llama-3', None, None, '8B'))
 
-        # Can't detect all non standard form in a heuristically safe way... best to err in caution and output nothing...
+        # Non standard naming
         self.assertEqual(gguf.Metadata.get_model_id_components("Qwen1.5-MoE-A2.7B-Chat"),
                          ('Qwen1.5-MoE-A2.7B-Chat', None, 'Qwen1.5-MoE', 'Chat', None, 'A2.7B'))
 
@@ -71,7 +71,7 @@ class TestMetadataMethod(unittest.TestCase):
         self.assertEqual(gguf.Metadata.get_model_id_components("delphi-suite/stories-llama2-50k", 50 * 10**3),
                          ('stories-llama2-50k', 'delphi-suite', 'stories-llama2', None, None, '50K'))
 
-        # None standard and not easy to disambiguate
+        # Non standard and not easy to disambiguate
         self.assertEqual(gguf.Metadata.get_model_id_components("DeepSeek-Coder-V2-Lite-Instruct"),
                          ('DeepSeek-Coder-V2-Lite-Instruct', None, 'DeepSeek-Coder-V2-Lite', 'Instruct', None, None))
 
@@ -123,6 +123,51 @@ class TestMetadataMethod(unittest.TestCase):
         self.assertEqual(gguf.Metadata.get_model_id_components("bigscience/bloom-7b1-petals"),
                          ('bloom-7b1-petals', 'bigscience', 'bloom', 'petals', None, '7.1B'))
 
+        # Ignore full-text size labels when there are number-based ones, and deduplicate size labels
+        self.assertEqual(gguf.Metadata.get_model_id_components("MaziyarPanahi/GreenNode-mini-7B-multilingual-v1olet-Mistral-7B-Instruct-v0.1"),
+                         ('GreenNode-mini-7B-multilingual-v1olet-Mistral-7B-Instruct-v0.1', 'MaziyarPanahi', 'GreenNode-mini', 'multilingual-v1olet-Mistral-Instruct', 'v0.1', '7B'))
+
+        # Instruct in a name without a size label
+        self.assertEqual(gguf.Metadata.get_model_id_components("mistralai/Mistral-Nemo-Instruct-2407"),
+                         ('Mistral-Nemo-Instruct-2407', 'mistralai', 'Mistral-Nemo', 'Instruct', '2407', None))
+
+        # Non-obvious splitting relying on 'chat' keyword
+        self.assertEqual(gguf.Metadata.get_model_id_components("deepseek-ai/DeepSeek-V2-Chat-0628"),
+                         ('DeepSeek-V2-Chat-0628', 'deepseek-ai', 'DeepSeek-V2', 'Chat', '0628', None))
+
+        # Multiple versions
+        self.assertEqual(gguf.Metadata.get_model_id_components("OpenGVLab/Mini-InternVL-Chat-2B-V1-5"),
+                         ('Mini-InternVL-Chat-2B-V1-5', 'OpenGVLab', 'Mini-InternVL', 'Chat', 'V1-5', '2B'))
+
+        # TODO: DPO in the name
+        self.assertEqual(gguf.Metadata.get_model_id_components("jondurbin/bagel-dpo-2.8b-v0.2"),
+                         ('bagel-dpo-2.8b-v0.2', 'jondurbin', 'bagel-dpo', None, 'v0.2', '2.8B'))
+
+        # DPO in name, but can't be used for the finetune to keep 'LLaMA-3' in the basename
+        self.assertEqual(gguf.Metadata.get_model_id_components("voxmenthe/SFR-Iterative-DPO-LLaMA-3-8B-R-unquantized"),
+                         ('SFR-Iterative-DPO-LLaMA-3-8B-R-unquantized', 'voxmenthe', 'SFR-Iterative-DPO-LLaMA-3', 'R-unquantized', None, '8B'))
+
+        # Too ambiguous
+        # TODO: should "base" be a 'finetune' or 'size_label'?
+        # (in this case it should be a size label, but other models use it to signal that they are not finetuned)
+        self.assertEqual(gguf.Metadata.get_model_id_components("microsoft/Florence-2-base"),
+                         ('Florence-2-base', 'microsoft', None, None, None, None))
+
+        ## Invalid cases ##
+
+        # Start with a dash and has dashes in rows
+        self.assertEqual(gguf.Metadata.get_model_id_components("mistralai/-Mistral--Nemo-Base-2407-"),
+                         ('-Mistral--Nemo-Base-2407-', 'mistralai', 'Mistral-Nemo-Base', None, '2407', None))
+
+        ## LoRA ##
+
+        self.assertEqual(gguf.Metadata.get_model_id_components("Llama-3-Instruct-abliteration-LoRA-8B"),
+                         ('Llama-3-Instruct-abliteration-LoRA-8B', None, 'Llama-3', 'Instruct-abliteration-LoRA', None, '8B'))
+
+        # Negative size --> output is a LoRA adaper --> prune "LoRA" out of the name to avoid redundancy with the suffix
+        self.assertEqual(gguf.Metadata.get_model_id_components("Llama-3-Instruct-abliteration-LoRA-8B", -1234),
+                         ('Llama-3-Instruct-abliteration-LoRA-8B', None, 'Llama-3', 'Instruct-abliteration', None, '8B'))
+
     def test_apply_metadata_heuristic_from_model_card(self):
         model_card = {
             'tags': ['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl'],
@@ -134,7 +179,7 @@ class TestMetadataMethod(unittest.TestCase):
         }
         got = gguf.Metadata.apply_metadata_heuristic(gguf.Metadata(), model_card, None, None)
         expect = gguf.Metadata()
-        expect.base_models=[{'name': 'Mistral 7B Merge 14 v0', 'organization': 'EmbeddedLLM', 'version': 'v0', 'repo_url': 'https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0'}, {'name': 'Trinity v1', 'organization': 'Janai Hq', 'version': 'v1', 'repo_url': 'https://huggingface.co/janai-hq/trinity-v1'}]
+        expect.base_models=[{'name': 'Mistral 7B Merge 14 v0', 'organization': 'EmbeddedLLM', 'version': '14-v0', 'repo_url': 'https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0'}, {'name': 'Trinity v1', 'organization': 'Janai Hq', 'version': 'v1', 'repo_url': 'https://huggingface.co/janai-hq/trinity-v1'}]
         expect.tags=['Llama-3', 'instruct', 'finetune', 'chatml', 'DPO', 'RLHF', 'gpt4', 'synthetic data', 'distillation', 'function calling', 'json mode', 'axolotl']
         expect.languages=['en']
         expect.datasets=['teknium/OpenHermes-2.5']
diff --git a/include/llama.h b/include/llama.h
index b280df325..413070d95 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -92,6 +92,9 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_CHATGLM4       = 17,
         LLAMA_VOCAB_PRE_TYPE_VIKING         = 18,
         LLAMA_VOCAB_PRE_TYPE_JAIS           = 19,
+        LLAMA_VOCAB_PRE_TYPE_TEKKEN         = 20,
+        LLAMA_VOCAB_PRE_TYPE_SMOLLM         = 21,
+        LLAMA_VOCAB_PRE_TYPE_CODESHELL      = 22,
     };
 
     // note: these values should be synchronized with ggml_rope
@@ -526,12 +529,16 @@ extern "C" {
             struct llama_lora_adapter * adapter,
             float scale);
 
-    // Remove a LoRA adapter from given context
+    // Remove a specific LoRA adapter from given context
     // Return -1 if the adapter is not present in the context
     LLAMA_API int32_t llama_lora_adapter_remove(
             struct llama_context * ctx,
             struct llama_lora_adapter * adapter);
 
+    // Remove all LoRA adapters from given context
+    LLAMA_API void llama_lora_adapter_clear(
+            struct llama_context * ctx);
+
     // Manually free a LoRA adapter
     // Note: loaded adapters will be free when the associated model is deleted
     LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
@@ -903,10 +910,10 @@ extern "C" {
     LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
 
     // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int32_t         llama_add_bos_token(const struct llama_model * model);
+    LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
 
     // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int32_t         llama_add_eos_token(const struct llama_model * model);
+    LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
 
     // Codellama infill tokens
     LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
@@ -962,6 +969,10 @@ extern "C" {
                             bool   remove_special,
                             bool   unparse_special);
 
+    //
+    // Chat templates
+    //
+
     /// Apply chat template. Inspired by hf apply_chat_template() on python.
     /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
     /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
@@ -1000,6 +1011,23 @@ extern "C" {
 
     LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
 
+    /// @details Apply constraints from grammar
+    LLAMA_API void llama_grammar_sample(
+            const struct llama_grammar * grammar,
+            const struct llama_context * ctx,
+                llama_token_data_array * candidates);
+    LLAMA_API DEPRECATED(void llama_sample_grammar(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+      const struct llama_grammar * grammar),
+        "use llama_grammar_sample instead");
+
+    /// @details Accepts the sampled token into the grammar
+    LLAMA_API void llama_grammar_accept_token(
+            struct llama_grammar * grammar,
+            struct llama_context * ctx,
+                     llama_token   token);
+
     //
     // Sampling functions
     //
@@ -1081,12 +1109,6 @@ extern "C" {
           llama_token_data_array * candidates,
                            float   temp);
 
-    /// @details Apply constraints from grammar
-    LLAMA_API void llama_sample_grammar(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-      const struct llama_grammar * grammar);
-
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
     /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -1124,12 +1146,6 @@ extern "C" {
             struct llama_context * ctx,
           llama_token_data_array * candidates);
 
-    /// @details Accepts the sampled token into the grammar
-    LLAMA_API void llama_grammar_accept_token(
-            struct llama_context * ctx,
-            struct llama_grammar * grammar,
-                     llama_token   token);
-
     //
     // Model split
     //
@@ -1172,38 +1188,45 @@ extern "C" {
 
 struct ggml_tensor;
 
+const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
+    struct llama_context * ctx
+);
+
 struct llama_partial_utf8 {
     uint32_t value;    // bit value so far (unshifted)
     int      n_remain; // num bytes remaining; -1 indicates invalid sequence
 };
 
-struct llama_grammar {
-    const std::vector<std::vector<llama_grammar_element>>   rules;
-    std::vector<std::vector<const llama_grammar_element *>> stacks;
-
-    // buffer for partially generated UTF-8 sequence from accepted tokens
-    llama_partial_utf8                                      partial_utf8;
-};
-
 struct llama_grammar_candidate {
     size_t               index;
     const uint32_t     * code_points;
     llama_partial_utf8   partial_utf8;
 };
 
-const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
-    struct llama_context * ctx
-);
+using llama_grammar_rule  = std::vector<      llama_grammar_element>;
+using llama_grammar_stack = std::vector<const llama_grammar_element *>;
+
+using llama_grammar_rules      = std::vector<llama_grammar_rule>;
+using llama_grammar_stacks     = std::vector<llama_grammar_stack>;
+using llama_grammar_candidates = std::vector<llama_grammar_candidate>;
+
+const llama_grammar_rules  & llama_grammar_get_rules (const struct llama_grammar * grammar);
+      llama_grammar_stacks & llama_grammar_get_stacks(      struct llama_grammar * grammar);
 
 void llama_grammar_accept(
-        const std::vector<std::vector<llama_grammar_element>>         & rules,
-        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
-        const uint32_t                                                  chr,
-        std::vector<std::vector<const llama_grammar_element *>>       & new_stacks);
+        const llama_grammar_rules  & rules,
+        const llama_grammar_stacks & stacks,
+        const uint32_t chr,
+              llama_grammar_stacks & new_stacks);
+
+std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
+        const llama_grammar_rules & rules,
+        const llama_grammar_stack & stack,
+        const llama_grammar_candidates & candidates);
 
 std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
         const std::string & src,
-        llama_partial_utf8   partial_start);
+        llama_partial_utf8 partial_start);
 
 // Randomly selects a token from the candidates based on their probabilities using given std::mt19937.
 // This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
diff --git a/models/ggml-vocab-gpt2.gguf b/models/ggml-vocab-gpt2.gguf
deleted file mode 100644
index 1fbc72c1e..000000000
Binary files a/models/ggml-vocab-gpt2.gguf and /dev/null differ
diff --git a/models/ggml-vocab-stablelm.gguf b/models/ggml-vocab-stablelm.gguf
deleted file mode 100644
index ebb0cdb7d..000000000
Binary files a/models/ggml-vocab-stablelm.gguf and /dev/null differ
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index c2049df79..46a6ad562 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -14,6 +14,9 @@ endif()
 add_library(llama
             ../include/llama.h
             llama.cpp
+            llama-vocab.cpp
+            llama-grammar.cpp
+            llama-sampling.cpp
             unicode.h
             unicode.cpp
             unicode-data.cpp
diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
new file mode 100644
index 000000000..bd9322e2f
--- /dev/null
+++ b/src/llama-grammar.cpp
@@ -0,0 +1,539 @@
+#include "llama-grammar.h"
+
+#include "llama-vocab.h"
+#include "llama-sampling.h"
+
+#include <algorithm>
+
+// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
+// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
+std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+        const std::string & src,
+        llama_partial_utf8 partial_start) {
+    static const int      lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
+    const char          * pos      = src.c_str();
+    std::vector<uint32_t> code_points;
+
+    // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
+    code_points.reserve(src.size() + 1);
+    uint32_t value    = partial_start.value;
+    int      n_remain = partial_start.n_remain;
+
+    // continue previous decode, if applicable
+    while (*pos != 0 && n_remain > 0) {
+        uint8_t next_byte = static_cast<uint8_t>(*pos);
+        if ((next_byte >> 6) != 2) {
+            // invalid sequence, abort
+            code_points.push_back(0);
+            return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, -1 });
+        }
+        value = (value << 6) + (next_byte & 0x3F);
+        ++pos;
+        --n_remain;
+    }
+
+    if (partial_start.n_remain > 0 && n_remain == 0) {
+        code_points.push_back(value);
+    }
+
+    // decode any subsequent utf-8 sequences, which may end in an incomplete one
+    while (*pos != 0) {
+        uint8_t first_byte = static_cast<uint8_t>(*pos);
+        uint8_t highbits   = first_byte >> 4;
+                n_remain   = lookup[highbits] - 1;
+
+        if (n_remain < 0) {
+            // invalid sequence, abort
+            code_points.clear();
+            code_points.push_back(0);
+            return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, n_remain });
+        }
+
+        uint8_t mask  = (1 << (7 - n_remain)) - 1;
+                value = first_byte & mask;
+
+        ++pos;
+        while (*pos != 0 && n_remain > 0) {
+            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
+            ++pos;
+            --n_remain;
+        }
+        if (n_remain == 0) {
+            code_points.push_back(value);
+        }
+    }
+    code_points.push_back(0);
+
+    return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
+}
+
+const llama_grammar_rules & llama_grammar_get_rules(const struct llama_grammar * grammar) {
+    return grammar->rules;
+}
+
+llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar) {
+    return grammar->stacks;
+}
+
+// returns true iff pos points to the end of one of the definitions of a rule
+static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
+    switch (pos->type) {
+        case LLAMA_GRETYPE_END: return true;  // NOLINT
+        case LLAMA_GRETYPE_ALT: return true;  // NOLINT
+        default:                return false;
+    }
+}
+
+// returns true iff chr satisfies the char range at pos (regular or inverse range)
+// asserts that pos is pointing to a char range element
+static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
+        const llama_grammar_element * pos,
+        const uint32_t                chr) {
+
+    bool found            = false;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
+
+    GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
+
+    do {
+        if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
+            // inclusive range, e.g. [a-z]
+            found = found || (pos->value <= chr && chr <= pos[1].value);
+            pos += 2;
+        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+            // Any character matches "."
+            found = true;
+            pos += 1;
+        } else {
+            // exact char match, e.g. [a] or "a"
+            found = found || pos->value == chr;
+            pos += 1;
+        }
+    } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
+
+    return std::make_pair(found == is_positive_char, pos);
+}
+
+// returns true iff some continuation of the given partial UTF-8 sequence could satisfy the char
+// range at pos (regular or inverse range)
+// asserts that pos is pointing to a char range element
+static bool llama_grammar_match_partial_char(
+        const llama_grammar_element * pos,
+        const llama_partial_utf8      partial_utf8) {
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
+    GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
+
+    uint32_t partial_value = partial_utf8.value;
+    int      n_remain      = partial_utf8.n_remain;
+
+    // invalid sequence or 7-bit char split across 2 bytes (overlong)
+    if (n_remain < 0 || (n_remain == 1 && partial_value < 2)) {
+        return false;
+    }
+
+    // range of possible code points this partial UTF-8 sequence could complete to
+    uint32_t low  = partial_value << (n_remain * 6);
+    uint32_t high = low | ((1 << (n_remain * 6)) - 1);
+
+    if (low == 0) {
+        if (n_remain == 2) {
+            low = 1 << 11;
+        } else if (n_remain == 3) {
+            low = 1 << 16;
+        }
+    }
+
+    do {
+        if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
+            // inclusive range, e.g. [a-z]
+            if (pos->value <= high && low <= pos[1].value) {
+                return is_positive_char;
+            }
+            pos += 2;
+        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+            // Any character matches "."
+            return true;
+        } else {
+            // exact char match, e.g. [a] or "a"
+            if (low <= pos->value && pos->value <= high) {
+                return is_positive_char;
+            }
+            pos += 1;
+        }
+    } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
+
+    return !is_positive_char;
+}
+
+// transforms a grammar pushdown stack into N possible stacks, all ending
+// at a character range (terminal element)
+static void llama_grammar_advance_stack(
+        const llama_grammar_rules  & rules,
+        const llama_grammar_stack  & stack,
+              llama_grammar_stacks & new_stacks) {
+    if (stack.empty()) {
+        if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
+            new_stacks.emplace_back(stack);
+        }
+        return;
+    }
+
+    const llama_grammar_element * pos = stack.back();
+
+    switch (pos->type) {
+        case LLAMA_GRETYPE_RULE_REF: {
+            const size_t                  rule_id = static_cast<size_t>(pos->value);
+            const llama_grammar_element * subpos  = rules[rule_id].data();
+            do {
+                // init new stack without the top (pos)
+                llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
+                if (!llama_grammar_is_end_of_sequence(pos + 1)) {
+                    // if this rule ref is followed by another element, add that to stack
+                    new_stack.push_back(pos + 1);
+                }
+                if (!llama_grammar_is_end_of_sequence(subpos)) {
+                    // if alternate is nonempty, add to stack
+                    new_stack.push_back(subpos);
+                }
+                llama_grammar_advance_stack(rules, new_stack, new_stacks);
+                while (!llama_grammar_is_end_of_sequence(subpos)) {
+                    // scan to end of alternate def
+                    subpos++;
+                }
+                if (subpos->type == LLAMA_GRETYPE_ALT) {
+                    // there's another alternate def of this rule to process
+                    subpos++;
+                } else {
+                    break;
+                }
+            } while (true);
+            break;
+        }
+        case LLAMA_GRETYPE_CHAR:
+        case LLAMA_GRETYPE_CHAR_NOT:
+        case LLAMA_GRETYPE_CHAR_ANY:
+            if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
+                // only add the stack if it's not a duplicate of one we already have
+                new_stacks.emplace_back(stack);
+            }
+            break;
+        default:
+            // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
+            // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
+            // those
+            GGML_ASSERT(false);
+    }
+}
+
+// takes a set of possible pushdown stacks on a grammar, which are required to
+// be positioned at a character range (see `llama_grammar_advance_stack`), and
+// produces the N possible stacks if the given char is accepted at those
+// positions
+void llama_grammar_accept(
+        const llama_grammar_rules  & rules,
+        const llama_grammar_stacks & stacks,
+        const uint32_t               chr,
+              llama_grammar_stacks & new_stacks) {
+    new_stacks.clear();
+
+    for (const auto & stack : stacks) {
+        if (stack.empty()) {
+            continue;
+        }
+
+        auto match = llama_grammar_match_char(stack.back(), chr);
+        if (match.first) {
+            const llama_grammar_element * pos = match.second;
+
+            // update top of stack to next element, if any
+            llama_grammar_stack new_stack(stack.begin(), stack.end() - 1);
+            if (!llama_grammar_is_end_of_sequence(pos)) {
+                new_stack.push_back(pos);
+            }
+            llama_grammar_advance_stack(rules, new_stack, new_stacks);
+        }
+    }
+}
+
+static llama_grammar_candidates llama_grammar_reject_candidates(
+        const llama_grammar_rules  & rules,
+        const llama_grammar_stacks & stacks,
+        const llama_grammar_candidates & candidates) {
+    GGML_ASSERT(!stacks.empty()); // REVIEW
+
+    if (candidates.empty()) {
+        return {};
+    }
+
+    auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
+
+    for (size_t i = 1, size = stacks.size(); i < size; ++i) {
+        rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
+    }
+    return rejects;
+}
+
+llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
+        const llama_grammar_rules      & rules,
+        const llama_grammar_stack      & stack,
+        const llama_grammar_candidates & candidates) {
+
+    llama_grammar_candidates rejects;
+    rejects.reserve(candidates.size());
+
+    if (stack.empty()) {
+        for (const auto & tok : candidates) {
+            if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
+                rejects.push_back(tok);
+            }
+        }
+        return rejects;
+    }
+
+    const llama_grammar_element * stack_pos = stack.back();
+
+    llama_grammar_candidates next_candidates;
+    next_candidates.reserve(candidates.size());
+
+    for (const auto & tok : candidates) {
+        if (*tok.code_points == 0) {
+            // reached end of full codepoints in token, reject iff it ended in a partial sequence
+            // that cannot satisfy this position in grammar
+            if (tok.partial_utf8.n_remain != 0 &&
+                    !llama_grammar_match_partial_char(stack_pos, tok.partial_utf8)) {
+                rejects.push_back(tok);
+            }
+        } else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
+            next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
+        } else {
+            rejects.push_back(tok);
+        }
+    }
+
+    const auto * stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
+
+    // update top of stack to next element, if any
+    llama_grammar_stack stack_after(stack.begin(), stack.end() - 1);
+    if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
+        stack_after.push_back(stack_pos_after);
+    }
+    llama_grammar_stacks next_stacks;
+    llama_grammar_advance_stack(rules, stack_after, next_stacks);
+
+    auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
+    for (const auto & tok : next_rejects) {
+        rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
+    }
+
+    return rejects;
+}
+
+static bool llama_grammar_detect_left_recursion(
+        const llama_grammar_rules & rules,
+        size_t rule_index,
+        std::vector<bool> * rules_visited,
+        std::vector<bool> * rules_in_progress,
+        std::vector<bool> * rules_may_be_empty) {
+    if ((*rules_in_progress)[rule_index]) {
+        return true;
+    }
+
+    (*rules_in_progress)[rule_index] = true;
+
+    const llama_grammar_rule & rule = rules[rule_index];
+
+    // First check if the rule might produce the empty string. This could be done combined with the second
+    // step but it's more readable as two steps.
+    bool at_rule_start = true;
+    for (size_t i = 0; i < rule.size(); i++) {
+        if (llama_grammar_is_end_of_sequence(&rule[i])) {
+            if (at_rule_start) {
+                (*rules_may_be_empty)[rule_index] = true;
+                break;
+            }
+            at_rule_start = true;
+        } else {
+            at_rule_start = false;
+        }
+    }
+
+    // Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
+    // be empty)
+    bool recurse_into_nonterminal = true;
+    for (size_t i = 0; i < rule.size(); i++) {
+        if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
+            if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
+                return true;
+            }
+            if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
+                recurse_into_nonterminal = false;
+            }
+        } else if (llama_grammar_is_end_of_sequence(&rule[i])) {
+            recurse_into_nonterminal = true;
+        } else {
+            recurse_into_nonterminal = false;
+        }
+    }
+
+    (*rules_in_progress)[rule_index] = false;
+    (*rules_visited)[rule_index] = true;
+    return false;
+}
+
+//
+// grammar - external
+//
+
+struct llama_grammar * llama_grammar_init_impl(
+            const llama_grammar_element ** rules,
+                                 size_t    n_rules,
+                                 size_t    start_rule_index) {
+    const llama_grammar_element * pos;
+
+    // copy rule definitions into vectors
+    llama_grammar_rules vec_rules(n_rules);
+    for (size_t i = 0; i < n_rules; i++) {
+        for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
+            vec_rules[i].push_back(*pos);
+        }
+        vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
+    }
+
+    // Check for left recursion
+    std::vector<bool> rules_visited(n_rules);
+    std::vector<bool> rules_in_progress(n_rules);
+    std::vector<bool> rules_may_be_empty(n_rules);
+    for (size_t i = 0; i < n_rules; i++) {
+        if (rules_visited[i]) {
+            continue;
+        }
+        if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
+            LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu", i);
+            return nullptr;
+        }
+    }
+
+    // loop over alternates of start rule to build initial stacks
+    llama_grammar_stacks stacks;
+    pos = vec_rules[start_rule_index].data();
+    do {
+        llama_grammar_stack stack;
+        if (!llama_grammar_is_end_of_sequence(pos)) {
+            // if alternate is nonempty, add to stack
+            stack.push_back(pos);
+        }
+        llama_grammar_advance_stack(vec_rules, stack, stacks);
+        while (!llama_grammar_is_end_of_sequence(pos)) {
+            // scan to end of alternate def
+            pos++;
+        }
+        if (pos->type == LLAMA_GRETYPE_ALT) {
+            // there's another alternate def of this rule to process
+            pos++;
+        } else {
+            break;
+        }
+    } while (true);
+
+    // Important: vec_rules has to be moved here, not copied, because stacks contains
+    // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
+    // then the pointers would be invalidated when the local vec_rules goes out of scope.
+    return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
+}
+
+void llama_grammar_free_impl(struct llama_grammar * grammar) {
+    delete grammar;
+}
+
+struct llama_grammar * llama_grammar_copy_impl(const struct llama_grammar * grammar) {
+    llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
+
+    // redirect elements in stacks to point to new rules
+    for (size_t is = 0; is < result->stacks.size(); is++) {
+        for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
+            for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
+                for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
+                    if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
+                         result->stacks[is][ie]  =  &result->rules[ir0][ir1];
+                    }
+                }
+            }
+        }
+    }
+
+    return result;
+}
+
+void llama_grammar_sample_impl(const struct llama_grammar * grammar, const struct llama_vocab * vocab, const struct llama_sampling * smpl, llama_token_data_array * candidates) {
+    GGML_ASSERT(grammar);
+    GGML_ASSERT(vocab);
+
+    int64_t t_start_sample_us = ggml_time_us();
+
+    bool allow_eog = false;
+    for (const auto & stack : grammar->stacks) {
+        if (stack.empty()) {
+            allow_eog = true;
+            break;
+        }
+    }
+
+    std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
+    candidates_decoded.reserve(candidates->size);
+
+    llama_grammar_candidates candidates_grammar;
+    candidates_grammar.reserve(candidates->size);
+
+    for (size_t i = 0; i < candidates->size; ++i) {
+        const llama_token id      = candidates->data[i].id;
+        const std::string & piece = vocab->cache_token_to_piece.at(id);
+
+        if (llama_token_is_eog_impl(*vocab, id)) {
+            if (!allow_eog) {
+                candidates->data[i].logit = -INFINITY;
+            }
+        } else if (piece.empty() || piece[0] == 0) {
+            candidates->data[i].logit = -INFINITY;
+        } else {
+            candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
+            candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
+        }
+    }
+
+    const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
+    for (const auto & reject : rejects) {
+        candidates->data[reject.index].logit = -INFINITY;
+    }
+
+    smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
+}
+
+void llama_grammar_accept_token_impl(struct llama_grammar * grammar, const struct llama_vocab * vocab, const struct llama_sampling * smpl, llama_token token) {
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    if (llama_token_is_eog_impl(*vocab, token)) {
+        for (const auto & stack : grammar->stacks) {
+            if (stack.empty()) {
+                return;
+            }
+        }
+        GGML_ASSERT(false);
+    }
+
+    const std::string & piece = vocab->cache_token_to_piece.at(token);
+
+    // Note terminating 0 in decoded string
+    const auto   decoded     = decode_utf8(piece, grammar->partial_utf8);
+    const auto & code_points = decoded.first;
+
+    llama_grammar_stacks tmp_new_stacks;
+    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+        llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks);
+        grammar->stacks = tmp_new_stacks;
+    }
+
+    grammar->partial_utf8 = decoded.second;
+    GGML_ASSERT(!grammar->stacks.empty());
+
+    smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
+}
diff --git a/src/llama-grammar.h b/src/llama-grammar.h
new file mode 100644
index 000000000..695ea0632
--- /dev/null
+++ b/src/llama-grammar.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include "llama-impl.h"
+
+struct llama_vocab;
+struct llama_sampling;
+
+struct llama_grammar {
+    const llama_grammar_rules  rules;
+          llama_grammar_stacks stacks;
+
+    // buffer for partially generated UTF-8 sequence from accepted tokens
+    llama_partial_utf8 partial_utf8;
+};
+
+//
+// internal API
+//
+
+struct llama_grammar * llama_grammar_init_impl(
+            const llama_grammar_element ** rules,
+                                 size_t    n_rules,
+                                 size_t    start_rule_index);
+
+void llama_grammar_free_impl(struct llama_grammar * grammar);
+
+struct llama_grammar * llama_grammar_copy_impl(const struct llama_grammar * grammar);
+
+void llama_grammar_sample_impl(
+        const struct llama_grammar * grammar,
+          const struct llama_vocab * vocab,
+       const struct llama_sampling * smpl,
+            llama_token_data_array * candidates);
+
+void llama_grammar_accept_token_impl(
+              struct llama_grammar * grammar,
+          const struct llama_vocab * vocab,
+       const struct llama_sampling * smpl,
+                       llama_token   token);
diff --git a/src/llama-impl.h b/src/llama-impl.h
new file mode 100644
index 000000000..dcc8c1c15
--- /dev/null
+++ b/src/llama-impl.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#define LLAMA_API_INTERNAL
+#include "llama.h"
+
+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_ATTRIBUTE_FORMAT(...)
+#endif
+
+//
+// logging
+//
+
+LLAMA_ATTRIBUTE_FORMAT(2, 3)
+void llama_log_internal        (ggml_log_level level, const char * format, ...);
+void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
+
+#define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
+#define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
+#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
new file mode 100644
index 000000000..8910f6d65
--- /dev/null
+++ b/src/llama-sampling.cpp
@@ -0,0 +1,635 @@
+#include "llama-sampling.h"
+
+#include <algorithm>
+#include <cstring>
+#include <ctime>
+#include <cfloat>
+#include <numeric>
+#include <unordered_map>
+
+static void llama_log_softmax(float * array, size_t size) {
+    float max_l = *std::max_element(array, array + size);
+    float sum = 0.f;
+    for (size_t i = 0; i < size; ++i) {
+        float p = expf(array[i] - max_l);
+        sum += p;
+        array[i] = p;
+    }
+
+    for (size_t i = 0; i < size; ++i) {
+        array[i] = logf(array[i] / sum);
+    }
+}
+
+void llama_set_rng_seed_impl(struct llama_sampling * smpl, uint32_t seed) {
+    if (seed == LLAMA_DEFAULT_SEED) {
+        seed = time(NULL);
+    }
+
+    smpl->rng.seed(seed);
+}
+
+void llama_sample_softmax_impl(struct llama_sampling * smpl, llama_token_data_array * candidates) {
+    GGML_ASSERT(candidates->size > 0);
+
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    // Sort the logits in descending order
+    if (!candidates->sorted) {
+        std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+            return a.logit > b.logit;
+        });
+        candidates->sorted = true;
+    }
+
+    float max_l = candidates->data[0].logit;
+    float cum_sum = 0.0f;
+    for (size_t i = 0; i < candidates->size; ++i) {
+        float p = expf(candidates->data[i].logit - max_l);
+        candidates->data[i].p = p;
+        cum_sum += p;
+    }
+    for (size_t i = 0; i < candidates->size; ++i) {
+        candidates->data[i].p /= cum_sum;
+    }
+
+    if (smpl) {
+        smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+void llama_sample_top_k_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
+    // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
+    // if (k >= (int32_t)candidates->size) {
+    //     return;
+    // }
+
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    if (k <= 0) {
+        k = candidates->size;
+    }
+
+    k = std::max(k, (int) min_keep);
+    k = std::min(k, (int) candidates->size);
+
+    // Sort scores in descending order
+    if (!candidates->sorted) {
+        auto comp = [](const llama_token_data & a, const llama_token_data & b) {
+            return a.logit > b.logit;
+        };
+        if (k <= 128) {
+            std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
+        } else {
+            constexpr int   nbuckets     = 128;
+            constexpr float bucket_low   = -10.0f;
+            constexpr float bucket_high  =  10.0f;
+            constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
+            constexpr float bucker_inter = -bucket_low * bucket_scale;
+
+            std::vector<int> bucket_idx(candidates->size);
+            std::vector<int> histo(nbuckets, 0);
+
+            for (int i = 0; i < (int)candidates->size; ++i) {
+                const float val = candidates->data[i].logit;
+                int ib = int(bucket_scale * val + bucker_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
+                ib = std::max(0, std::min(nbuckets-1, ib));
+                bucket_idx[i] = ib;
+                ++histo[ib];
+            }
+            int nhave = 0;
+            int ib = nbuckets - 1;
+            for ( ; ib >= 0; --ib) {
+                nhave += histo[ib];
+                if (nhave >= k) break;
+            }
+            std::vector<llama_token_data> tmp_tokens(nhave);
+            auto ptr = tmp_tokens.data();
+            std::vector<llama_token_data*> bucket_ptrs;
+            bucket_ptrs.reserve(nbuckets - ib);
+            for (int j = nbuckets - 1; j >= ib; --j) {
+                bucket_ptrs.push_back(ptr);
+                ptr += histo[j];
+            }
+            for (int i = 0; i < (int)candidates->size; ++i) {
+                int j = bucket_idx[i];
+                if (j >= ib) {
+                    *bucket_ptrs[nbuckets-1-j]++ = candidates->data[i];
+                }
+            }
+
+            ptr = tmp_tokens.data();
+            int ndone = 0;
+            for (int j = nbuckets-1; j > ib; --j) {
+                std::sort(ptr, ptr + histo[j], comp);
+                ptr += histo[j];
+                ndone += histo[j];
+            }
+            std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
+
+            std::memcpy(candidates->data, tmp_tokens.data(), k*sizeof(llama_token_data));
+
+        }
+        candidates->sorted = true;
+    }
+    candidates->size = k;
+
+    if (smpl) {
+        smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+void llama_sample_top_p_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
+    if (p >= 1.0f) {
+        return;
+    }
+
+    llama_sample_softmax_impl(smpl, candidates);
+
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    // Compute the cumulative probabilities
+    float cum_sum = 0.0f;
+    size_t last_idx = candidates->size;
+
+    for (size_t i = 0; i < candidates->size; ++i) {
+        cum_sum += candidates->data[i].p;
+
+        // Check if the running sum is at least p or if we have kept at least min_keep tokens
+        // we set the last index to i+1 to indicate that the current iterate should be included in the set
+        if (cum_sum >= p && i + 1 >= min_keep) {
+            last_idx = i + 1;
+            break;
+        }
+    }
+
+    // Resize the output vector to keep only the top-p tokens
+    candidates->size = last_idx;
+
+    if (smpl) {
+        smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+void llama_sample_min_p_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
+    if (p <= 0.0f || !candidates->size) {
+        return;
+    }
+
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    bool min_p_applied = false;
+
+    // if the candidates aren't sorted, try the unsorted implementation first
+    if (!candidates->sorted) {
+        std::vector<llama_token_data> filtered_tokens;
+
+        float max_logit = -FLT_MAX;
+        for (size_t i = 0; i < candidates->size; ++i) {
+            max_logit = std::max(max_logit, candidates->data[i].logit);
+        }
+        const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max
+
+        for (size_t i = 0; i < candidates->size; ++i) {
+            if (candidates->data[i].logit >= min_logit) {
+                filtered_tokens.push_back(candidates->data[i]);
+            }
+        }
+
+        // if we have enough values the operation was a success
+        if (filtered_tokens.size() >= min_keep) {
+            memcpy(candidates->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
+            candidates->size = filtered_tokens.size();
+            min_p_applied = true;
+        }
+    }
+
+    // if the candidates are sorted or the unsorted implementation failed, use this implementation
+    if (!min_p_applied) {
+        // Sort the logits in descending order
+        if (!candidates->sorted) {
+            std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+                return a.logit > b.logit;
+            });
+            candidates->sorted = true;
+        }
+
+        const float min_logit = candidates->data[0].logit + logf(p); // min logit for p_i >= p * p_max
+        size_t i = 1; // first token always matches
+
+        for (; i < candidates->size; ++i) {
+            if (candidates->data[i].logit < min_logit && i >= min_keep) {
+                break; // prob too small
+            }
+        }
+
+        // Resize the output vector to keep only the matching tokens
+        candidates->size = i;
+    }
+
+    if (smpl) {
+        smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+void llama_sample_tail_free_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float z, size_t min_keep) {
+    if (z >= 1.0f || candidates->size <= 2) {
+        return;
+    }
+
+    llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    // Compute the first and second derivatives
+    std::vector<float> first_derivatives(candidates->size - 1);
+    std::vector<float> second_derivatives(candidates->size - 2);
+
+    for (size_t i = 0; i < first_derivatives.size(); ++i) {
+        first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p;
+    }
+    for (size_t i = 0; i < second_derivatives.size(); ++i) {
+        second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
+    }
+
+    // Calculate absolute value of second derivatives
+    for (size_t i = 0; i < second_derivatives.size(); ++i) {
+        second_derivatives[i] = std::abs(second_derivatives[i]);
+    }
+
+    // Normalize the second derivatives
+    {
+        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+
+        if (second_derivatives_sum > 1e-6f) {
+            for (float & value : second_derivatives) {
+                value /= second_derivatives_sum;
+            }
+        } else {
+            for (float & value : second_derivatives) {
+                value = 1.0f / second_derivatives.size();
+            }
+        }
+    }
+
+    float cum_sum = 0.0f;
+    size_t last_idx = candidates->size;
+    for (size_t i = 0; i < second_derivatives.size(); ++i) {
+        cum_sum += second_derivatives[i];
+
+        // Check if the running sum is greater than z or if we have kept at least min_keep tokens
+        if (cum_sum > z && i >= min_keep) {
+            last_idx = i;
+            break;
+        }
+    }
+
+    // Resize the output vector to keep only the tokens above the tail location
+    candidates->size = last_idx;
+
+    if (smpl) {
+        smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+void llama_sample_typical_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) {
+    // Reference implementation:
+    // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
+    if (p >= 1.0f) {
+        return;
+    }
+
+    // Compute the softmax of logits and calculate entropy
+    llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);
+
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    float entropy = 0.0f;
+    for (size_t i = 0; i < candidates->size; ++i) {
+        entropy += -candidates->data[i].p * logf(candidates->data[i].p);
+    }
+
+    // Compute the absolute difference between negative log probability and entropy for each candidate
+    std::vector<float> shifted_scores;
+    for (size_t i = 0; i < candidates->size; ++i) {
+        float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
+        shifted_scores.push_back(shifted_score);
+    }
+
+    // Sort tokens based on the shifted_scores and their corresponding indices
+    std::vector<size_t> indices(candidates->size);
+    std::iota(indices.begin(), indices.end(), 0);
+
+    std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
+        return shifted_scores[a] < shifted_scores[b];
+    });
+
+    // Compute the cumulative probabilities
+    float cum_sum = 0.0f;
+    size_t last_idx = indices.size();
+
+    for (size_t i = 0; i < indices.size(); ++i) {
+        size_t idx = indices[i];
+        cum_sum += candidates->data[idx].p;
+
+        // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
+        if (cum_sum > p && i >= min_keep - 1) {
+            last_idx = i + 1;
+            break;
+        }
+    }
+
+    // Resize the output vector to keep only the locally typical tokens
+    std::vector<llama_token_data> new_candidates;
+    for (size_t i = 0; i < last_idx; ++i) {
+        size_t idx = indices[i];
+        new_candidates.push_back(candidates->data[idx]);
+    }
+
+    // Replace the data in candidates with the new_candidates data
+    std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
+    candidates->size = new_candidates.size();
+    candidates->sorted = false;
+
+    if (smpl) {
+        smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+void llama_sample_entropy_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val) {
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    // no need to do anything if there is only one (or zero) candidates
+    if(candidates->size <= 1) {
+        return;
+    }
+
+    // Calculate maximum possible entropy
+    float max_entropy = -logf(1.0f / candidates->size);
+
+    llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);
+
+    // Calculate entropy of the softmax probabilities
+    float entropy = 0.0f;
+    for (size_t i = 0; i < candidates->size; ++i) {
+        float prob = candidates->data[i].p;
+        if (prob > 0.0f) { // Ensure no log(0)
+            entropy -= prob * logf(prob);
+        }
+    }
+
+    // Normalize the entropy (max_entropy cannot be 0 here because we checked candidates->size != 1 above)
+    float normalized_entropy = entropy / max_entropy;
+
+    // Map the normalized entropy to the desired temperature range using the power function
+    float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
+
+#ifdef DEBUG
+    LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp);
+    LLAMA_LOG_INFO("Entropy: %f\n", entropy);
+    LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy);
+    LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy);
+    LLAMA_LOG_INFO("Exponent: %f\n", exponent_val);
+    LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp);
+#endif
+
+    // Apply the dynamically calculated temperature scaling
+    for (size_t i = 0; i < candidates->size; ++i) {
+        candidates->data[i].logit /= dyn_temp;
+    }
+
+    // Re-compute softmax probabilities after scaling logits with dynamic temperature
+    double max_l_double = candidates->data[0].logit;
+    double cum_sum_double = 0.0;
+    for (size_t i = 0; i < candidates->size; ++i) {
+        double p = exp(candidates->data[i].logit - max_l_double);
+        candidates->data[i].p = p; // Store the scaled probability
+        cum_sum_double += p;
+    }
+    for (size_t i = 0; i < candidates->size; ++i) {
+        candidates->data[i].p /= cum_sum_double; // Re-normalize the probabilities
+    }
+
+#ifdef DEBUG
+    // Print the updated top 25 probabilities after temperature scaling
+    LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
+    for (size_t i = 0; i < 25 && i < candidates->size; ++i) {
+        LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates->data[i].p * 100.0f);
+    }
+#endif
+
+    if (smpl) {
+        smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+void llama_sample_temp_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float temp) {
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    for (size_t i = 0; i < candidates->size; ++i) {
+        candidates->data[i].logit /= temp;
+    }
+
+    if (smpl) {
+        smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+void llama_sample_repetition_penalties_impl(
+        struct llama_sampling * smpl,
+       llama_token_data_array * candidates,
+            const llama_token * last_tokens,
+                       size_t   penalty_last_n,
+                       float   penalty_repeat,
+                       float   penalty_freq,
+                       float   penalty_present) {
+    if (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)) {
+        return;
+    }
+
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    // Create a frequency map to count occurrences of each token in last_tokens
+    std::unordered_map<llama_token, int> token_count;
+    for (size_t i = 0; i < penalty_last_n; ++i) {
+        token_count[last_tokens[i]]++;
+    }
+
+    // Apply frequency and presence penalties to the candidates
+    for (size_t i = 0; i < candidates->size; ++i) {
+        const auto token_iter = token_count.find(candidates->data[i].id);
+        if (token_iter == token_count.end()) {
+            continue;
+        }
+
+        const int count = token_iter->second;
+
+        // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
+        // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
+        if (candidates->data[i].logit <= 0) {
+            candidates->data[i].logit *= penalty_repeat;
+        } else {
+            candidates->data[i].logit /= penalty_repeat;
+        }
+
+        candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present;
+    }
+
+    candidates->sorted = false;
+
+    if (smpl) {
+        smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+void llama_sample_apply_guidance_impl(
+        struct llama_sampling * smpl,
+                        float * logits,
+                        float * logits_guidance,
+                        float   scale) {
+    GGML_ASSERT(smpl);
+
+    const auto t_start_sample_us = ggml_time_us();
+    const auto n_vocab = smpl->n_vocab;
+
+    llama_log_softmax(logits, n_vocab);
+    llama_log_softmax(logits_guidance, n_vocab);
+
+    for (int i = 0; i < n_vocab; ++i) {
+              auto & l = logits[i];
+        const auto & g = logits_guidance[i];
+
+        l = scale * (l - g) + g;
+    }
+
+    smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
+}
+
+llama_token llama_sample_token_mirostat_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
+    GGML_ASSERT(smpl);
+
+    const int32_t n_vocab = float(smpl->n_vocab);
+
+    int64_t t_start_sample_us = ggml_time_us();
+
+    llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);
+
+    // Estimate s_hat using the most probable m tokens
+    float s_hat = 0.0;
+    float sum_ti_bi = 0.0;
+    float sum_ti_sq = 0.0;
+    for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) {
+        float t_i = logf(float(i + 2) / float(i + 1));
+        float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p);
+        sum_ti_bi += t_i * b_i;
+        sum_ti_sq += t_i * t_i;
+    }
+    s_hat = sum_ti_bi / sum_ti_sq;
+
+    // Compute k from the estimated s_hat and target surprise value
+    float epsilon_hat = s_hat - 1;
+    float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(n_vocab, -epsilon_hat)), 1 / s_hat);
+
+    // Sample the next word X using top-k sampling
+    llama_sample_top_k_impl((struct llama_sampling *) nullptr, candidates, int(k), 1);
+    smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
+    llama_token X = llama_sample_token_impl(smpl, candidates);
+    t_start_sample_us = ggml_time_us();
+
+    // Compute error as the difference between observed surprise and target surprise value
+    size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+        return candidate.id == X;
+    }));
+    float observed_surprise = -log2f(candidates->data[X_idx].p);
+    float e = observed_surprise - tau;
+
+    // Update mu using the learning rate and error
+    *mu = *mu - eta * e;
+
+    smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
+    return X;
+}
+
+llama_token llama_sample_token_mirostat_v2_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, float * mu) {
+    int64_t t_start_sample_us;
+    t_start_sample_us = ggml_time_us();
+
+    llama_sample_softmax_impl(smpl, candidates);
+
+    // Truncate the words with surprise values greater than mu
+    candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+        return -log2f(candidate.p) > *mu;
+    }));
+
+    if (candidates->size == 0) {
+        candidates->size = 1;
+    }
+
+    if (smpl) {
+        smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+
+    // Normalize the probabilities of the remaining words
+    llama_sample_softmax_impl(smpl, candidates);
+
+    // Sample the next word X from the remaining words
+    llama_token X = llama_sample_token_impl(smpl, candidates);
+    t_start_sample_us = ggml_time_us();
+
+    // Compute error as the difference between observed surprise and target surprise value
+    size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+        return candidate.id == X;
+    }));
+    float observed_surprise = -log2f(candidates->data[X_idx].p);
+    float e = observed_surprise - tau;
+
+    // Update mu using the learning rate and error
+    *mu = *mu - eta * e;
+
+    if (smpl) {
+        smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+    return X;
+}
+
+llama_token llama_sample_token_greedy_impl(struct llama_sampling * smpl, llama_token_data_array * candidates) {
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    // Find max element
+    auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+        return a.logit < b.logit;
+    });
+
+    llama_token result = max_iter->id;
+    if (smpl) {
+        smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
+        smpl->n_sample++;
+    }
+    return result;
+}
+
+llama_token llama_sample_token_with_rng_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, std::mt19937 & rng) {
+    GGML_ASSERT(smpl);
+
+    const int64_t t_start_sample_us = ggml_time_us();
+    llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates);
+
+    std::vector<float> probs;
+    probs.reserve(candidates->size);
+    for (size_t i = 0; i < candidates->size; ++i) {
+        probs.push_back(candidates->data[i].p);
+    }
+
+    std::discrete_distribution<> dist(probs.begin(), probs.end());
+    int idx = dist(rng);
+
+    llama_token result = candidates->data[idx].id;
+
+    smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
+    smpl->n_sample++;
+
+    return result;
+}
+
+llama_token llama_sample_token_impl(struct llama_sampling * smpl, llama_token_data_array * candidates) {
+    return llama_sample_token_with_rng_impl(smpl, candidates, smpl->rng);
+}
diff --git a/src/llama-sampling.h b/src/llama-sampling.h
new file mode 100644
index 000000000..f7f8e3ef7
--- /dev/null
+++ b/src/llama-sampling.h
@@ -0,0 +1,56 @@
+#pragma once
+
+#include "llama-impl.h"
+
+struct llama_sampling {
+    llama_sampling(int32_t n_vocab) : n_vocab(n_vocab) {}
+
+    std::mt19937 rng;
+
+    int32_t n_vocab = 0;
+
+    mutable int64_t t_sample_us = 0;
+    mutable int32_t n_sample = 0;
+
+    void reset_timings() const {
+        t_sample_us = 0;
+        n_sample = 0;
+    }
+};
+
+//
+// internal API
+//
+
+void llama_set_rng_seed_impl(struct llama_sampling * smpl, uint32_t seed);
+
+void llama_sample_softmax_impl  (struct llama_sampling * smpl, llama_token_data_array * candidates);
+void llama_sample_top_k_impl    (struct llama_sampling * smpl, llama_token_data_array * candidates, int32_t k, size_t min_keep);
+void llama_sample_top_p_impl    (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
+void llama_sample_min_p_impl    (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
+void llama_sample_tail_free_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float z, size_t min_keep);
+void llama_sample_typical_impl  (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep);
+void llama_sample_entropy_impl  (struct llama_sampling * smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val);
+void llama_sample_temp_impl     (struct llama_sampling * smpl, llama_token_data_array * candidates, float temp);
+
+void llama_sample_repetition_penalties_impl(
+        struct llama_sampling * smpl,
+       llama_token_data_array * candidates,
+            const llama_token * last_tokens,
+                       size_t   penalty_last_n,
+                        float   penalty_repeat,
+                        float   penalty_freq,
+                        float   penalty_present);
+
+void llama_sample_apply_guidance_impl(
+        struct llama_sampling * smpl,
+                        float * logits,
+                        float * logits_guidance,
+                        float   scale);
+
+llama_token llama_sample_token_mirostat_impl   (struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu);
+llama_token llama_sample_token_mirostat_v2_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, float * mu);
+llama_token llama_sample_token_greedy_impl     (struct llama_sampling * smpl, llama_token_data_array * candidates);
+llama_token llama_sample_token_with_rng_impl   (struct llama_sampling * smpl, llama_token_data_array * candidates, std::mt19937 & rng);
+llama_token llama_sample_token_impl            (struct llama_sampling * smpl, llama_token_data_array * candidates);
+
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
new file mode 100644
index 000000000..c482b3689
--- /dev/null
+++ b/src/llama-vocab.cpp
@@ -0,0 +1,1721 @@
+#include "llama-vocab.h"
+
+#include "unicode.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cfloat>
+#include <climits>
+#include <cstdarg>
+#include <cstring>
+#include <forward_list>
+#include <queue>
+#include <sstream>
+
+//
+// helpers
+//
+
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    std::string result;
+    for (size_t pos = 0; ; pos += search.length()) {
+        auto new_pos = s.find(search, pos);
+        if (new_pos == std::string::npos) {
+            result += s.substr(pos, s.size() - pos);
+            break;
+        }
+        result += s.substr(pos, new_pos - pos) + replace;
+        pos = new_pos;
+    }
+    s = std::move(result);
+}
+
+LLAMA_ATTRIBUTE_FORMAT(1, 2)
+static std::string format(const char * fmt, ...) {
+    va_list ap;
+    va_list ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
+
+struct naive_trie {
+    naive_trie() : has_value(false), value(0) {
+    }
+    void insert(const char * key, size_t len, int32_t value = 0) {
+        if (len == 0) {
+            this->has_value = true;
+            this->value = value;
+            return;
+        }
+        char c = key[0];
+        auto res = children.find(c);
+        if (res != children.end()) {
+            res->second.insert(key + 1, len - 1, value);
+        } else {
+            auto res = children.insert(std::make_pair(c, naive_trie()));
+            res.first->second.insert(key + 1, len - 1, value);
+        }
+    }
+    std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) {
+        if (len == 0 || offset == len) {
+            return std::make_pair(key, offset);
+        }
+        char c = key[offset];
+        auto res = children.find(c);
+        if (res != children.end()) {
+            return res->second.get_longest_prefix(key, len, offset + 1);
+        } else {
+            return std::make_pair(key, offset);
+        }
+    }
+    struct naive_trie * traverse(const char c) {
+        auto res = children.find(c);
+        if (res != children.end()) {
+            return &res->second;
+        } else {
+            return NULL;
+        }
+    }
+    std::map<char, struct naive_trie> children;
+    bool has_value;
+    llama_token value;
+};
+
+//
+// impl
+//
+
+int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
+    GGML_ASSERT(token_left.find(' ')   == std::string::npos);
+    GGML_ASSERT(token_left.find('\n')  == std::string::npos);
+    GGML_ASSERT(token_right.find(' ')  == std::string::npos);
+    GGML_ASSERT(token_right.find('\n') == std::string::npos);
+
+    auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
+    if (it == bpe_ranks.end()) {
+        return -1;
+    }
+
+    return it->second;
+}
+
+static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
+    return vocab.type;
+}
+
+static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
+}
+
+static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
+}
+
+static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
+}
+
+static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
+}
+
+static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
+}
+
+static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
+    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED;
+}
+
+static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
+    GGML_ASSERT(llama_is_byte_token(vocab, id));
+    const auto & token_data = vocab.id_to_token.at(id);
+    switch (llama_vocab_get_type(vocab)) {
+        case LLAMA_VOCAB_TYPE_SPM:
+        case LLAMA_VOCAB_TYPE_UGM: {
+            auto buf = token_data.text.substr(3, 2);
+            return strtol(buf.c_str(), NULL, 16);
+        }
+        case LLAMA_VOCAB_TYPE_BPE: {
+            GGML_ASSERT(false);
+            return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
+        }
+        case LLAMA_VOCAB_TYPE_WPM: {
+            GGML_ASSERT(false);
+        }
+        default:
+            GGML_ASSERT(false);
+    }
+}
+
+static void llama_escape_whitespace(std::string & text) {
+    replace_all(text, " ", "\xe2\x96\x81");
+}
+
+static void llama_unescape_whitespace(std::string & word) {
+    replace_all(word, "\xe2\x96\x81", " ");
+}
+
+struct llm_symbol {
+    using index = int;
+    index prev;
+    index next;
+    const char * text;
+    size_t n;
+};
+
+static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not trivially copyable");
+
+//
+// SPM tokenizer
+// original implementation:
+// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
+//
+
+struct llm_bigram_spm {
+    struct comparator {
+        bool operator()(llm_bigram_spm & l, llm_bigram_spm & r) {
+            return (l.score < r.score) || (l.score == r.score && l.left > r.left);
+        }
+    };
+    using queue_storage = std::vector<llm_bigram_spm>;
+    using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;
+    llm_symbol::index left;
+    llm_symbol::index right;
+    float score;
+    size_t size;
+};
+
+struct llm_tokenizer_spm {
+    llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {}
+
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+        // split string into utf8 chars
+        int index = 0;
+        size_t offs = 0;
+        while (offs < text.size()) {
+            llm_symbol sym;
+            size_t len = unicode_len_utf8(text[offs]);
+            sym.text = text.c_str() + offs;
+            sym.n = std::min(len, text.size() - offs);
+            offs += sym.n;
+            sym.prev = index - 1;
+            sym.next = offs == text.size() ? -1 : index + 1;
+            index++;
+            symbols.emplace_back(sym);
+        }
+
+        // seed the work queue with all possible 2-character tokens.
+        for (size_t i = 1; i < symbols.size(); ++i) {
+            try_add_bigram(i - 1, i);
+        }
+
+        // keep substituting the highest frequency pairs for as long as we can.
+        while (!work_queue.empty()) {
+            auto bigram = work_queue.top();
+            work_queue.pop();
+
+            auto & left_sym = symbols[bigram.left];
+            auto & right_sym = symbols[bigram.right];
+
+            // if one of the symbols already got merged, skip it.
+            if (left_sym.n == 0 || right_sym.n == 0 ||
+                left_sym.n + right_sym.n != bigram.size) {
+                continue;
+            }
+
+            // merge the right sym into the left one
+            left_sym.n += right_sym.n;
+            right_sym.n = 0;
+
+            //LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
+
+            // remove the right sym from the chain
+            left_sym.next = right_sym.next;
+            if (right_sym.next >= 0) {
+                symbols[right_sym.next].prev = bigram.left;
+            }
+
+            // find more substitutions
+            try_add_bigram(left_sym.prev, bigram.left);
+            try_add_bigram(bigram.left, left_sym.next);
+        }
+
+        for (int i = 0; i != -1; i = symbols[i].next) {
+            auto & symbol = symbols[i];
+            resegment(symbol, output);
+        }
+    }
+
+private:
+    void resegment(llm_symbol & symbol, std::vector<llama_vocab::id> & output) {
+        auto text = std::string(symbol.text, symbol.n);
+        auto token = vocab.token_to_id.find(text);
+
+        // Do we need to support is_unused?
+        if (token != vocab.token_to_id.end()) {
+            output.push_back((*token).second);
+            return;
+        }
+
+        const auto p = rev_merge.find(text);
+
+        if (p == rev_merge.end()) {
+            // output any symbols that did not form tokens as bytes.
+            output.reserve(output.size() + symbol.n);
+            for (int j = 0; j < (int)symbol.n; ++j) {
+                llama_vocab::id token_id = llama_byte_to_token_impl(vocab, symbol.text[j]);
+                output.push_back(token_id);
+            }
+            return;
+        }
+
+        resegment(symbols[p->second.first],  output);
+        resegment(symbols[p->second.second], output);
+    }
+
+    void try_add_bigram(int left, int right) {
+        if (left == -1 || right == -1) {
+            return;
+        }
+
+        const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
+        auto token = vocab.token_to_id.find(text);
+
+        if (token == vocab.token_to_id.end()) {
+            return;
+        }
+
+        if (static_cast<size_t>((*token).second) >= vocab.id_to_token.size()) {
+            return;
+        }
+
+        const auto & tok_data = vocab.id_to_token[(*token).second];
+
+        llm_bigram_spm bigram;
+        bigram.left  = left;
+        bigram.right = right;
+        bigram.score = tok_data.score;
+        bigram.size  = text.size();
+
+        work_queue.push(bigram);
+
+        // Do we need to support is_unused?
+        rev_merge[text] = std::make_pair(left, right);
+    }
+
+    const llama_vocab & vocab;
+
+    std::vector<llm_symbol> symbols;
+    llm_bigram_spm::queue work_queue;
+
+    std::map<std::string, std::pair<int, int>> rev_merge;
+};
+
+//
+// BPE tokenizer
+// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
+// tried to simplify unicode stuff, so most likely does not work 100% correctly!
+//
+
+// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused
+
+struct llm_bigram_bpe {
+    struct comparator {
+        bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
+            return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
+        }
+    };
+
+    using queue_storage = std::vector<llm_bigram_bpe>;
+    using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;
+    llm_symbol::index left;
+    llm_symbol::index right;
+    std::string text;
+    int rank;
+    size_t size;
+};
+
+struct llm_tokenizer_bpe {
+    llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {
+        GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE);
+        switch (vocab.type_pre) {
+            case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+
+                    // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_DBRX:
+            case LLAMA_VOCAB_PRE_TYPE_SMAUG:
+                regex_exprs = {
+                    // same as llama3
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
+                regex_exprs = {
+                    "[\r\n]",
+                    "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿǄ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿﬀ-ﬆﬓ-ﬗＡ-Ｚａ-ｚ𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+                    "\\s?[!-/:-~！-／：-～‘-‟　-。]+",
+                    "\\s+$",
+                    "[一-龥ࠀ-一가-퟿]+",
+                    "\\p{N}+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
+                regex_exprs = {
+                    "[\r\n]",
+                    "\\s?\\p{L}+",
+                    "\\s?\\p{P}+",
+                    "[一-龥ࠀ-一가-퟿]+",
+                    "\\p{N}",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_FALCON:
+                regex_exprs = {
+                    "[\\p{P}\\$\\+<=>\\^~\\|`]+",
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                    "[0-9][0-9][0-9]",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_STARCODER:
+            case LLAMA_VOCAB_PRE_TYPE_REFACT:
+            case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
+            case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
+            case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
+                regex_exprs = {
+                    "\\p{N}",
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_GPT2:
+            case LLAMA_VOCAB_PRE_TYPE_MPT:
+            case LLAMA_VOCAB_PRE_TYPE_OLMO:
+            case LLAMA_VOCAB_PRE_TYPE_JAIS:
+                regex_exprs = {
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
+            case LLAMA_VOCAB_PRE_TYPE_QWEN2:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_PORO:
+                regex_exprs = {
+                    " ?[^(\\s|.,!?…。，、।۔،)]+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_CHATGLM4:
+                regex_exprs = {
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_VIKING:
+                regex_exprs = {
+                    " ?[^(\\s|.,!?…。，、।۔،)]+",
+                    "\\p{N}",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
+                // original regex from tokenizer.json
+                // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                regex_exprs = {
+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            default:
+                // default regex for BPE tokenization pre-processing
+                regex_exprs = {
+                    "[\\p{P}\\$\\+<=>\\^~\\|]+",
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                    "\\p{N}+",
+                    "[0-9][0-9][0-9]",
+                };
+                break;
+        }
+    }
+
+    void append(const llama_vocab::id token_id, std::vector<llama_vocab::id> & output) const {
+        output.push_back(token_id);
+    }
+
+    bool append_bos(std::vector<llama_vocab::id> & output) const {
+        if (vocab.tokenizer_add_bos) {
+            GGML_ASSERT(vocab.special_bos_id != -1);
+            output.push_back(vocab.special_bos_id);
+            return true;
+        }
+        return false;
+    }
+
+    bool append_eos(std::vector<llama_vocab::id> & output) const {
+        if (vocab.tokenizer_add_eos) {
+            GGML_ASSERT(vocab.special_eos_id != -1);
+            output.push_back(vocab.special_eos_id);
+            return true;
+        }
+        return false;
+    }
+
+    void check_double_bos_eos(const std::vector<llama_vocab::id> & output) const {
+        if (vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+            LLAMA_LOG_WARN(
+                "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                "Are you sure this is what you want?\n", __FUNCTION__);
+        }
+        if (vocab.tokenizer_add_eos && output.size() >= 2 && *(output.end()-2) == vocab.special_eos_id) {
+            LLAMA_LOG_WARN(
+                "%s: Added a EOS token to the prompt as specified by the model but the prompt "
+                "also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. "
+                "Are you sure this is what you want?\n", __FUNCTION__);
+        }
+    }
+
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+        int final_prev_index = -1;
+
+        const auto word_collection = unicode_regex_split(text, regex_exprs);
+
+        symbols_final.clear();
+
+        for (auto & word : word_collection) {
+            work_queue = llm_bigram_bpe::queue();
+            symbols.clear();
+
+            int index = 0;
+            size_t offset = 0;
+
+            if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+                symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
+                offset = word.size();
+            }
+
+            while (offset < word.size()) {
+                llm_symbol sym;
+                size_t char_len = std::min(word.size() - offset, (size_t) unicode_len_utf8(word[offset]));
+                sym.text = word.c_str() + offset;
+                sym.n = char_len;
+                offset += sym.n;
+                sym.prev = index - 1;
+                sym.next = offset == word.size() ? -1 : index + 1;
+                index++;
+                symbols.emplace_back(sym);
+            }
+            for (size_t i = 1; i < symbols.size(); ++i) {
+                add_new_bigram(i - 1, i);
+            }
+
+            // build token(s)
+            while (!work_queue.empty()) {
+                auto bigram = work_queue.top();
+                work_queue.pop();
+
+                auto & left_symbol = symbols[bigram.left];
+                auto & right_symbol = symbols[bigram.right];
+
+                if (left_symbol.n == 0 || right_symbol.n == 0) {
+                    continue;
+                }
+                std::string left_token = std::string(left_symbol.text, left_symbol.n);
+                std::string right_token = std::string(right_symbol.text, right_symbol.n);
+                if (left_token + right_token != bigram.text) {
+                    continue;  // Skip this bigram if it's outdated
+                }
+
+                // merge the right sym into the left one
+                left_symbol.n += right_symbol.n;
+                right_symbol.n = 0;
+
+                // remove the right sym from the chain
+                left_symbol.next = right_symbol.next;
+                if (right_symbol.next >= 0) {
+                    symbols[right_symbol.next].prev = bigram.left;
+                }
+
+                add_new_bigram(left_symbol.prev, bigram.left);  // left side of current symbol
+                add_new_bigram(bigram.left, left_symbol.next);  // right side of current symbol
+            }
+
+            // add the finished tokens to the final list keeping correct order for next and prev
+            for (auto & sym : symbols) {
+                if (sym.n > 0) {
+                    sym.prev = final_prev_index;
+                    sym.next = -1;
+                    if (final_prev_index != -1) {
+                        symbols_final[final_prev_index].next = symbols_final.size();
+                    }
+                    symbols_final.emplace_back(sym);
+                    final_prev_index = symbols_final.size() - 1;
+                }
+            }
+        }
+
+        symbols = symbols_final;
+
+        if (!symbols.empty()) {
+            for (int i = 0; i != -1; i = symbols[i].next) {
+                auto & symbol = symbols[i];
+                if (symbol.n == 0) {
+                    continue;
+                }
+
+                const std::string str = std::string(symbol.text, symbol.n);
+                const auto token = vocab.token_to_id.find(str);
+
+                if (token == vocab.token_to_id.end()) {
+                    for (auto j = str.begin(); j != str.end(); ++j) {
+                        std::string byte_str(1, *j);
+                        auto token_multibyte = vocab.token_to_id.find(byte_str);
+                        if (token_multibyte != vocab.token_to_id.end()) {
+                            output.push_back(token_multibyte->second);
+                        }
+                    }
+                } else {
+                    output.push_back((*token).second);
+                }
+            }
+        }
+    }
+
+private:
+    void add_new_bigram(int left, int right) {
+        if (left == -1 || right == -1) {
+            return;
+        }
+
+        std::string left_token  = std::string(symbols[left].text,  symbols[left].n);
+        std::string right_token = std::string(symbols[right].text, symbols[right].n);
+
+        int rank_found = -1;
+
+        rank_found = vocab.find_bpe_rank(left_token, right_token);
+
+        if (rank_found < 0) {
+            return;
+        }
+
+        llm_bigram_bpe bigram;
+
+        bigram.left  = left;
+        bigram.right = right;
+        bigram.text  = left_token + right_token;
+        bigram.size  = left_token.size() + right_token.size();
+        bigram.rank  = rank_found;
+
+        work_queue.push(bigram);
+    }
+
+    const llama_vocab & vocab;
+
+    std::vector<std::string> regex_exprs;
+
+    std::vector<llm_symbol> symbols;
+    std::vector<llm_symbol> symbols_final;
+
+    llm_bigram_bpe::queue work_queue;
+};
+
+//
+// WPM tokenizer
+//
+
+struct llm_tokenizer_wpm {
+    llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
+
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) const {
+        const auto & token_map = vocab.token_to_id;
+
+        // normalize and split by whitespace
+        std::vector<std::string> words = preprocess(text);
+
+        // bos token prepended already
+
+        // find the longest tokens that form the words
+        for (const std::string & word : words) {
+            // skip empty words
+            if (word.size() == 0) {
+                continue;
+            }
+
+            // prepend phantom space
+            const std::string word1 = "\xe2\x96\x81" + word;
+            const int n = word1.size();
+
+            const size_t current_tokens = output.size();
+
+            // we're at the start of a new word
+            // move through character position in word
+            for (int i = 0; i < n; ++i) {
+                // loop through possible match length
+                bool match = false;
+                for (int j = std::min(n, i + vocab.max_token_len + 1); j > i; j--) {
+                    auto it = token_map.find(word1.substr(i, j - i));
+                    if (it != token_map.end()) {
+                        output.push_back(it->second);
+                        match = true;
+                        i = j - 1;
+                        break;
+                    }
+                }
+
+                if (!match) { // discard all
+                    output.resize(current_tokens);
+                    break;  // and discard next tokens
+                }
+            }
+
+            // we didn't find any matches for this word
+            if (current_tokens == output.size()) {
+                output.push_back(vocab.special_unk_id);
+            }
+        }
+    }
+
+    // TODO: reduce string copies by using cpts_offs array
+    std::vector<std::string> preprocess(const std::string & text) const {
+        const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
+        std::vector<std::string> words(1, "");
+
+        for (const uint32_t cpt : cpts_nfd) {
+            const auto flags = unicode_cpt_flags(cpt);
+
+            if (flags.is_whitespace) {
+                if (words.back().size()) {  // finish previous word if any
+                    words.emplace_back();
+                }
+                continue;
+            }
+
+            assert (!flags.is_separator);
+            if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
+                continue;
+            }
+
+            const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
+            if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
+                if (words.back().size()) {  // finish previous word if any
+                    words.emplace_back();
+                }
+                words.back() = s;       // single char word
+                words.emplace_back();   // start a new word
+            } else {
+                words.back() += s;  // append char to word
+            }
+        }
+
+        if (!words.back().size()) {
+            words.pop_back();
+        }
+
+        return words;
+    }
+
+    static bool is_chinese_char(uint32_t cpt) {
+        return
+            (cpt >= 0x04E00 && cpt <= 0x09FFF) ||
+            (cpt >= 0x03400 && cpt <= 0x04DBF) ||
+            (cpt >= 0x20000 && cpt <= 0x2A6DF) ||
+            (cpt >= 0x2A700 && cpt <= 0x2B73F) ||
+            (cpt >= 0x2B740 && cpt <= 0x2B81F) ||
+            (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
+            (cpt >= 0x0F900 && cpt <= 0x0FAFF) ||
+            (cpt >= 0x2F800 && cpt <= 0x2FA1F);
+            //(cpt >= 0x3000  && cpt <= 0x303F)  ||
+            //(cpt >= 0xFF00  && cpt <= 0xFFEF);
+    }
+
+    const llama_vocab & vocab;
+};
+
+//
+// UGM tokenizer
+//
+
+struct llm_tokenizer_ugm {
+    llm_tokenizer_ugm(const llama_vocab & vocab) : vocab(vocab) {
+        if (vocab.precompiled_charsmap.size() > 0) {
+            size_t charsmap_offset = 0;
+
+            // First four bytes of precompiled_charsmap contains length of binary
+            // blob containing XOR-compressed compact double array (XCDA) entries
+            uint32_t xcda_blob_size = *(const uint32_t *) &vocab.precompiled_charsmap[0];
+            charsmap_offset += sizeof(xcda_blob_size);
+            if (xcda_blob_size + charsmap_offset >= vocab.precompiled_charsmap.size()) {
+                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
+            }
+
+            // Next xcda_blob_size bytes contain entries of XOR-compressed compact
+            // double array (XCDA). Each entry is bit-packed into a 32-bit integer.
+            xcda_array = (const uint32_t *) &vocab.precompiled_charsmap[charsmap_offset];
+            xcda_array_size = xcda_blob_size / sizeof(uint32_t);
+            charsmap_offset += xcda_blob_size;
+
+            // Remaining bytes of precompiled charsmap contain null-terminated
+            // replacement strings for prefixes matched by the XCDA.
+            prefix_replacements = &vocab.precompiled_charsmap[charsmap_offset];
+            prefix_replacements_size = vocab.precompiled_charsmap.size() - charsmap_offset;
+        }
+
+        for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) {
+            const auto &token_data = vocab.id_to_token[id];
+
+            if (llama_is_normal_token(vocab, id)) {
+                min_score = std::min<float>(min_score, token_data.score);
+                max_score = std::max<float>(max_score, token_data.score);
+            }
+
+            if (llama_is_normal_token(vocab, id) ||
+                llama_is_user_defined_token(vocab, id) ||
+                llama_is_unused_token(vocab, id)) {
+                token_matcher.insert(token_data.text.data(), token_data.text.size(), id);
+            }
+
+            if (llama_is_user_defined_token(vocab, id)) {
+                user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size());
+            }
+        }
+
+        unknown_token_score = min_score - unknown_token_score_penalty;
+    }
+
+    /* This implementation is based on SentencePiece optimized Viterbi algorithm for
+     * unigram language models. The general idea is to:
+     * - move along the input sequence in steps of one UTF code point,
+     * - at each step find all possible tokenizations of the prefix by
+     *   traversing the tokens trie,
+     * - for each tokenization store the best one so far (by higher score)
+     * - use the position in sequence after given token as an index to store
+     *   results
+     * - if there was no valid tokenization of the current UTF code point
+     *   then use unknown token with additional score penalty
+     * After processing the whole sequence we backtrack from the end to get
+     * the best tokenization.
+    */
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+        // normalize the input first
+        std::string normalized;
+        normalize(text, &normalized);
+        size_t input_len = normalized.size();
+        if (input_len == 0) {
+            return;
+        }
+
+        // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
+        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.special_unk_id, 0, -FLT_MAX});
+        // at the beginning tokenization score is zero
+        tokenization_results[0] = { vocab.special_unk_id, 0, 0 };
+
+        for (size_t input_offset = 0; input_offset < input_len;) {
+            size_t prefix_offset = input_offset;
+            // calculate how many code units are in the currently processed UTF code point
+            size_t n_utf8_code_units = std::min<size_t>(unicode_len_utf8(normalized[input_offset]), input_len - input_offset);
+
+            // traverse the token matcher trie to find a matching token
+            bool single_codepoint_token_found = false;
+            const struct best_tokenization & current_best = tokenization_results[input_offset];
+            struct naive_trie * node  = token_matcher.traverse(normalized[prefix_offset++]);
+
+            while (prefix_offset <= input_len && node != NULL) {
+                // check if we found valid token in prefix
+                if (node->has_value) {
+                    // check if it corresponds to the whole UTF code point
+                    if (prefix_offset - input_offset == n_utf8_code_units) {
+                        single_codepoint_token_found = true;
+                    }
+                    llama_token token_id = node->value;
+                    const auto & token_data = vocab.id_to_token[token_id];
+
+                    // we set the user-defined token scores to 0 to make them more likely to be selected
+                    // (normal token scores are log probabilities, so they are negative)
+                    // score type is double here to make tokenization results exactly
+                    // the same as in the HF tokenizer using SentencePiece
+                    const double token_score = llama_is_user_defined_token(vocab, token_id) ? 0.0 : token_data.score;
+                    const double challenger_score = current_best.score_sum + token_score;
+                    struct best_tokenization & current_champ = tokenization_results[prefix_offset];
+                    if (challenger_score > current_champ.score_sum) {
+                        struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score };
+                        current_champ = challenger;
+                    }
+                }
+                node = node->traverse(normalized[prefix_offset++]);
+            }
+
+            // if we didn't find a valid token corresponding to the whole UTF code point
+            // then use unknown token as the tokenization of this UTF code point
+            if (!single_codepoint_token_found) {
+                const double challenger_score = current_best.score_sum + unknown_token_score;
+                prefix_offset = input_offset + n_utf8_code_units;
+                struct best_tokenization & current_champ = tokenization_results[prefix_offset];
+                if (challenger_score > current_champ.score_sum) {
+                    struct best_tokenization challenger = { vocab.special_unk_id, input_offset, (float) challenger_score };
+                    current_champ = challenger;
+                }
+            }
+
+            // move to the next UTF code point
+            input_offset += n_utf8_code_units;
+        }
+
+        // now backtrack from the end to gather token ids of the best tokenization
+        // merge sequences of consecutive unknown tokens into single unknown tokens
+        bool is_prev_unknown = false;
+        for (struct best_tokenization & tokenization = tokenization_results[input_len]; ; tokenization = tokenization_results[tokenization.input_offset]) {
+            bool is_unknown = tokenization.token_id == vocab.special_unk_id;
+            if (!(is_prev_unknown && is_unknown)) {
+                output.push_back(tokenization.token_id);
+            }
+            if (tokenization.input_offset == 0) {
+                break;
+            }
+            is_prev_unknown = is_unknown;
+        }
+
+        // reverse the output since we added tokens starting from the end of the input
+        std::reverse(output.begin(), output.end());
+    }
+
+private:
+    const llama_vocab & vocab;
+
+    // helper structure for returning normalization results
+    struct normalization_result {
+        const char * normalized;
+        size_t normalized_len;
+        size_t consumed_input;
+    };
+
+    void normalize(const std::string& input, std::string * normalized) {
+        normalized->clear();
+        normalized->reserve(input.size() * 3);
+
+        const std::string space = vocab.tokenizer_escape_whitespaces ? escaped_space : " ";
+
+        bool shall_prepend_space = !vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
+        bool shall_append_space = vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
+        bool shall_merge_spaces = vocab.tokenizer_remove_extra_whitespaces;
+
+        bool is_space_prepended = false;
+        bool processing_non_ws = false;
+
+        size_t input_len = input.size();
+
+        for (size_t input_offset = 0; input_offset < input_len; ) {
+            auto norm_res = normalize_prefix(input, input_offset);
+            for (size_t i = 0; i < norm_res.normalized_len; i++) {
+                char c = norm_res.normalized[i];
+                if (c != ' ') {
+                    if (!processing_non_ws) {
+                        processing_non_ws = true;
+                        if ((shall_prepend_space && !is_space_prepended) || shall_merge_spaces) {
+                            normalized->append(space);
+                            is_space_prepended = true;
+                        }
+                    }
+                    normalized->push_back(c);
+                } else {
+                    if (processing_non_ws) {
+                        processing_non_ws = false;
+                    }
+                    if (!shall_merge_spaces) {
+                        normalized->append(space);
+                    }
+                }
+            }
+
+            input_offset += norm_res.consumed_input;
+        }
+
+        if (shall_append_space) {
+            normalized->append(space);
+        }
+    }
+
+    /*
+     * This structure is a view wrapper for XOR-compressed double array (XCDA)
+     * See Shunsuke Kanda (2018). Space- and Time-Efficient String Dictionaries.
+     * Eeach bit-packed entry contains:
+     * - BASE array value in bits 10-30
+     * - LCHECK array value in bits 0-7
+     * - LEAF array value in bit 9
+     * Entries containing indexes of replacement sequences have set bit 31
+     */
+    struct xcda_array_view {
+    public:
+        xcda_array_view(const uint32_t * xcda_array, size_t xcda_array_size) : xcda_array(xcda_array), xcda_array_size(xcda_array_size) {
+        }
+        uint32_t get_base(size_t index) {
+            uint32_t packed_node = get_node(index);
+            return (packed_node >> 10) << ((packed_node & (1U << 9)) >> 6);
+        }
+        uint32_t get_lcheck(size_t index) {
+            uint32_t packed_node = get_node(index);
+            return packed_node & ((1U << 31) | 0xff);
+        }
+        bool get_leaf(size_t index) {
+            uint32_t packed_node = get_node(index);
+            return (packed_node >> 8) & 1;
+        }
+        uint32_t get_value(size_t index) {
+            uint32_t packed_node = get_node(index);
+            return packed_node & ((1U << 31) - 1);
+        }
+    private:
+        uint32_t get_node(size_t index) {
+            if (index > xcda_array_size) {
+                throw std::runtime_error("Index out of array bounds in XCDA array!");
+            }
+            return xcda_array[index];
+        }
+        const uint32_t * xcda_array;
+        size_t xcda_array_size;
+    };
+
+    struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
+        if (input_offset == input.size()) {
+            return { &input[input_offset], 0, 0 };
+        }
+
+        // if input prefix matches some user-defined token return this token as normalization result
+        auto user_defined_token_match = user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
+        if (user_defined_token_match.second > 0) {
+            return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
+        }
+
+        size_t longest_prefix_length = 0;
+        size_t longest_prefix_offset = 0;
+
+        if (xcda_array_size > 0) {
+            struct xcda_array_view xcda_view(xcda_array, xcda_array_size);
+
+            // Find the longest normalized sequence matching the input prefix by walking
+            // the XOR-compressed compact double array (XCDA) starting from the root node
+            // We find the index of the next node by calculating BASE[s] ^ c where s is
+            // the index of the previous node and c is a numerical character value
+            uint32_t node_index = 0;
+            // get BASE of the root node
+            node_index = xcda_view.get_base(node_index);
+            for (size_t prefix_offset = input_offset; prefix_offset < input.size(); prefix_offset++) {
+                unsigned char c = input[prefix_offset];
+                if (c == 0) {
+                    break;
+                }
+                node_index ^= c;
+                // if value of LCHECK is not c it means that this is not a child of
+                // the previous node, so we stop matching
+                if (xcda_view.get_lcheck(node_index) != c) {
+                    break;
+                }
+                bool is_leaf = xcda_view.get_leaf(node_index);
+                // get BASE of the current node
+                node_index ^= xcda_view.get_base(node_index);
+                // if LEAF of the current node is true, it means that its BASE points to the node
+                // containing index of replacement sequence for currently matched input prefix
+                if (is_leaf)
+                {
+                    longest_prefix_length = prefix_offset - input_offset + 1;
+                    // get index of replacement sequence for currently matched input prefix
+                    longest_prefix_offset = xcda_view.get_value(node_index);
+                }
+            }
+        }
+
+        if (longest_prefix_length > 0) {
+            // we have a match, so return the replacement sequence
+            if (longest_prefix_offset >= prefix_replacements_size) {
+                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
+            }
+            const char * prefix_replacement = &prefix_replacements[longest_prefix_offset];
+            return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
+        } else {
+            // check if the input prefix contains a valid sequence of UTF-8 code units
+            try {
+                // if yes, return this sequence unmodified
+                size_t prefix_offset = input_offset;
+                unicode_cpt_from_utf8(input, prefix_offset);
+                return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset };
+            } catch (std::invalid_argument & /*ex*/) {
+                // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER
+                return { "\xEF\xBF\xBD", 3, 1 };
+            }
+        }
+    }
+
+    // escaped space symbol - U+2581 (Lower One Eighth Block)
+    const std::string escaped_space = "\xE2\x96\x81";
+
+    const char * prefix_replacements = NULL;
+    size_t prefix_replacements_size = 0;
+
+    const uint32_t * xcda_array = NULL;
+    size_t xcda_array_size = 0;
+
+    struct naive_trie user_defined_token_matcher;
+
+    // this structure stores the best tokenization so far at input_offset
+    struct best_tokenization {
+        llama_token token_id;
+        size_t input_offset;
+        float score_sum;
+    };
+
+    float min_score = FLT_MAX;
+    float max_score = -FLT_MAX;
+
+    float unknown_token_score_penalty = 10.0;
+    float unknown_token_score;
+
+    struct naive_trie token_matcher;
+};
+
+//
+// (de-) tokenize
+//
+
+typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
+    FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
+    FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
+} FRAGMENT_BUFFER_VARIANT_TYPE;
+
+struct fragment_buffer_variant {
+    fragment_buffer_variant(llama_vocab::id _token)
+    :
+        type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
+        token(_token),
+        raw_text(_dummy),
+        offset(0),
+        length(0) {}
+
+    fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
+    :
+        type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
+        token((llama_vocab::id) - 1),
+        raw_text(_raw_text),
+        offset(_offset),
+        length(_length){
+            GGML_ASSERT(_offset >= 0);
+            GGML_ASSERT(_length >= 1);
+            GGML_ASSERT(offset + length <= raw_text.length());
+        }
+
+    const FRAGMENT_BUFFER_VARIANT_TYPE type;
+    const llama_vocab::id token;
+    const std::string _dummy;
+    const std::string & raw_text;
+    const uint64_t offset;
+    const uint64_t length;
+};
+
+// #define PRETOKENIZERDEBUG
+
+static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) {
+    // for each special token
+    for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
+        const auto & data = vocab.id_to_token[special_id];
+        const auto & special_token = data.text;
+
+        if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) {
+            // Ignore control and unknown tokens when parse_special == false
+            continue;
+            // User-defined tokens are still pre-tokenized before everything else
+            // ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
+            // This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
+        }
+
+        // for each text fragment
+        std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
+        while (it != buffer.end()) {
+            auto & fragment = (*it);
+
+            // if a fragment is text ( not yet processed )
+            if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                auto & raw_text = fragment.raw_text;
+
+                auto raw_text_base_offset = fragment.offset;
+                auto raw_text_base_length = fragment.length;
+
+                // loop over the text
+                while (true) {
+                    // find the first occurrence of a given special token in this fragment
+                    //  passing offset argument only limit the "search area" but match coordinates
+                    //  are still relative to the source full raw_text
+                    auto match = raw_text.find(special_token, raw_text_base_offset);
+
+                    // no occurrences found, stop processing this fragment for a given special token
+                    if (match == std::string::npos) break;
+
+                    // check if match is within bounds of offset <-> length
+                    if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break;
+
+#ifdef PRETOKENIZERDEBUG
+                    LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
+#endif
+                    auto source = std::distance(buffer.begin(), it);
+
+                    // if match is further than base offset
+                    //  then we have some text to the left of it
+                    if (match > raw_text_base_offset) {
+                        // left
+                        const int64_t left_reminder_offset = raw_text_base_offset + 0;
+                        int64_t left_reminder_length = match - raw_text_base_offset;
+
+                        if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
+                            while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
+                                left_reminder_length--;
+                            }
+                        }
+
+                        if (left_reminder_length > 0) {
+                            buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
+                            it++;
+                        }
+
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
+#endif
+                    }
+
+                    // special token
+                    buffer.emplace_after(it, special_id);
+                    it++;
+
+                    // right
+                    if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
+                        int64_t right_reminder_offset = match + special_token.length();
+                        int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
+
+                        if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
+                            while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
+                                right_reminder_offset++;
+                                right_reminder_length--;
+                            }
+                        }
+
+                        if (right_reminder_length > 0) {
+                            buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
+                            it++;
+                        }
+
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
+#endif
+
+                        if (source == 0) {
+                            buffer.erase_after(buffer.before_begin());
+                        } else {
+                            buffer.erase_after(std::next(buffer.begin(), (source-1)));
+                        }
+
+                        // repeat for the right side
+                        raw_text_base_offset = right_reminder_offset;
+                        raw_text_base_length = right_reminder_length;
+
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
+#endif
+                    } else {
+                        if (source == 0) {
+                            buffer.erase_after(buffer.before_begin());
+                        } else {
+                            buffer.erase_after(std::next(buffer.begin(), (source-1)));
+                        }
+                        break;
+                    }
+                }
+            }
+            it++;
+        }
+    }
+}
+
+std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) {
+    std::vector<llama_vocab::id> output;
+    std::forward_list<fragment_buffer_variant> fragment_buffer;
+
+    if (!raw_text.empty()) {
+        fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
+        tokenizer_st_partition(vocab, fragment_buffer, parse_special);
+    }
+
+    switch (vocab.type) {
+        case LLAMA_VOCAB_TYPE_SPM:
+            {
+                // OG tokenizer behavior:
+                //
+                // tokenizer.encode('', add_special_tokens=True)  returns [1]
+                // tokenizer.encode('', add_special_tokens=False) returns []
+
+                bool is_prev_special = true;  // prefix with space if first token
+
+                if (add_special && vocab.tokenizer_add_bos) {
+                    GGML_ASSERT(vocab.special_bos_id != -1);
+                    output.push_back(vocab.special_bos_id);
+                    is_prev_special = true;
+                }
+
+                for (const auto & fragment : fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+                        // prefix with space if previous is special
+                        if (vocab.tokenizer_add_space_prefix && is_prev_special) {
+                            raw_text = " " + raw_text;
+                        }
+
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+#endif
+                        llm_tokenizer_spm tokenizer(vocab);
+                        llama_escape_whitespace(raw_text);
+                        tokenizer.tokenize(raw_text, output);
+                        is_prev_special = false;
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                        output.push_back(fragment.token);
+                        is_prev_special = true;
+                    }
+                }
+
+                if (add_special && vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+                    LLAMA_LOG_WARN(
+                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                        "Are you sure this is what you want?\n", __FUNCTION__);
+                }
+
+                if (add_special && vocab.tokenizer_add_eos) {
+                    GGML_ASSERT(vocab.special_eos_id != -1);
+                    output.push_back(vocab.special_eos_id);
+                }
+            } break;
+        case LLAMA_VOCAB_TYPE_BPE:
+            {
+                llm_tokenizer_bpe tokenizer(vocab);
+
+                if (add_special) {
+                    tokenizer.append_bos(output);
+                }
+                for (const auto & fragment : fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+#endif
+                        tokenizer.tokenize(raw_text, output);
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                        tokenizer.append(fragment.token, output);
+                    }
+                }
+
+                if (add_special) {
+                    tokenizer.append_eos(output);
+                    tokenizer.check_double_bos_eos(output);
+                }
+            } break;
+        case LLAMA_VOCAB_TYPE_WPM:
+            {
+                if (add_special) {
+                    GGML_ASSERT(vocab.special_cls_id != -1);
+                    output.push_back(vocab.special_cls_id);
+                }
+
+                llm_tokenizer_wpm tokenizer(vocab);
+
+                for (const auto & fragment : fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+#endif
+                        tokenizer.tokenize(raw_text, output);
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                        output.push_back(fragment.token);
+                    }
+                }
+
+                if (add_special) {
+                    GGML_ASSERT(vocab.special_sep_id != -1);
+                    output.push_back(vocab.special_sep_id);
+                }
+            } break;
+        case LLAMA_VOCAB_TYPE_UGM:
+            {
+                llm_tokenizer_ugm tokenizer(vocab);
+
+                if (add_special && vocab.tokenizer_add_bos != 0) {
+                    GGML_ASSERT(vocab.special_bos_id != -1);
+                    output.push_back(vocab.special_bos_id);
+                }
+
+                for (const auto & fragment : fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+#endif
+                        tokenizer.tokenize(raw_text, output);
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                        output.push_back(fragment.token);
+                    }
+                }
+
+                if (add_special && vocab.tokenizer_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+                    LLAMA_LOG_WARN(
+                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                        "Are you sure this is what you want?\n", __FUNCTION__);
+                }
+
+                if (add_special && vocab.tokenizer_add_eos == 1) {
+                    GGML_ASSERT(vocab.special_eos_id != -1);
+                    output.push_back(vocab.special_eos_id);
+                }
+            } break;
+        case LLAMA_VOCAB_TYPE_NONE:
+            GGML_ASSERT(false);
+    }
+
+    return output;
+}
+
+llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch) {
+    GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
+    static const char * hex = "0123456789ABCDEF";
+    switch (llama_vocab_get_type(vocab)) {
+        case LLAMA_VOCAB_TYPE_SPM:
+        case LLAMA_VOCAB_TYPE_UGM: {
+            const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
+            auto token = vocab.token_to_id.find(buf);
+            if (token != vocab.token_to_id.end()) {
+                return (*token).second;
+            }
+            // Try to fall back to just the byte as a string
+            const char buf2[2] = { (char)ch, 0 };
+            return vocab.token_to_id.at(buf2);
+        }
+        case LLAMA_VOCAB_TYPE_WPM:
+        case LLAMA_VOCAB_TYPE_BPE: {
+            return vocab.token_to_id.at(unicode_byte_to_utf8(ch));
+        }
+        default:
+            GGML_ASSERT(false);
+    }
+}
+
+const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
+    return vocab.id_to_token[token].text.c_str();
+}
+
+float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
+    return vocab.id_to_token[token].score;
+}
+
+llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
+    return vocab.id_to_token[token].attr;
+}
+
+bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
+    return token != -1 && (
+        token == llama_token_eos_impl(vocab) ||
+        token == llama_token_eot_impl(vocab)
+    );
+}
+
+bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token) {
+    return llama_is_control_token(vocab, token);
+}
+
+llama_token llama_token_bos_impl(const struct llama_vocab & vocab) {
+    return vocab.special_bos_id;
+}
+
+llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {
+    return vocab.special_eos_id;
+}
+
+llama_token llama_token_cls_impl(const struct llama_vocab & vocab) {
+    return vocab.special_cls_id;
+}
+
+llama_token llama_token_sep_impl(const struct llama_vocab & vocab) {
+    return vocab.special_sep_id;
+}
+
+llama_token llama_token_nl_impl(const struct llama_vocab & vocab) {
+    return vocab.linefeed_id;
+}
+
+llama_token llama_token_pad_impl(const struct llama_vocab & vocab) {
+    return vocab.special_pad_id;
+}
+
+int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab) {
+    return vocab.tokenizer_add_bos;
+}
+
+int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab) {
+    return vocab.tokenizer_add_eos;
+}
+
+llama_token llama_token_prefix_impl(const struct llama_vocab & vocab) {
+    return vocab.special_prefix_id;
+}
+
+llama_token llama_token_middle_impl(const struct llama_vocab & vocab) {
+    return vocab.special_middle_id;
+}
+
+llama_token llama_token_suffix_impl(const struct llama_vocab & vocab) {
+    return vocab.special_suffix_id;
+}
+
+llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
+    return vocab.special_eot_id;
+}
+
+int32_t llama_tokenize_impl(
+    const struct llama_vocab & vocab,
+                  const char * text,
+                     int32_t   text_len,
+                 llama_token * tokens,
+                     int32_t   n_tokens_max,
+                        bool   add_special,
+                        bool   parse_special) {
+    auto res = llama_tokenize_internal(vocab, std::string(text, text_len), add_special, parse_special);
+    if (n_tokens_max < (int) res.size()) {
+        // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
+        return -((int) res.size());
+    }
+
+    for (size_t i = 0; i < res.size(); i++) {
+        tokens[i] = res[i];
+    }
+
+    return res.size();
+}
+
+static std::string llama_decode_text(const std::string & text) {
+    std::string decoded_text;
+
+    const auto cpts = unicode_cpts_from_utf8(text);
+    for (const auto cpt : cpts) {
+        const auto utf8 = unicode_cpt_to_utf8(cpt);
+        try {
+            decoded_text += unicode_utf8_to_byte(utf8);
+        } catch (const std::out_of_range & /*e*/) {
+            decoded_text += "[UNK_BYTE_0x";
+            for (const auto c : utf8) {
+                decoded_text += format("%02x", (uint8_t) c);
+            }
+            decoded_text += text + "]";
+        }
+    }
+
+    return decoded_text;
+}
+
+// does not write null-terminator to buf
+int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) {
+    // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
+    static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
+    const llama_token_attr attr = llama_token_get_attr_impl(vocab, token);
+    if (!special && (attr & attr_special)) {
+        return 0;
+    }
+
+    // copy piece chars to output text buffer
+    // skip up to 'lstrip' leading spaces before copying
+    auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
+        for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
+            token++;
+            size--;
+        }
+        if (length < (int32_t)size) {
+            return -(int32_t) size;
+        }
+        memcpy(buf, token, size);
+        return (int32_t) size;
+    };
+
+    // if we have a cache - use it
+    {
+        const auto & cache = vocab.cache_token_to_piece;
+
+        if (!cache.empty()) {
+            const auto & result = cache.at(token);
+            return _try_copy(result.data(), result.size());
+        }
+    }
+
+    if (0 <= token && token < (int32_t) vocab.id_to_token.size()) {
+        const std::string & token_text = vocab.id_to_token[token].text;
+        switch (llama_vocab_get_type(vocab)) {
+            case LLAMA_VOCAB_TYPE_WPM:
+            case LLAMA_VOCAB_TYPE_SPM:
+            case LLAMA_VOCAB_TYPE_UGM: {
+                // NOTE: we accept all unsupported token types,
+                // suppressing them like CONTROL tokens.
+                if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
+                    return _try_copy(token_text.data(), token_text.size());
+                } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+                    std::string result = token_text;
+                    llama_unescape_whitespace(result);
+                    return _try_copy(result.data(), result.size());
+                } else if (attr & LLAMA_TOKEN_ATTR_BYTE) {
+                    char byte = (char) llama_token_to_byte(vocab, token);
+                    return _try_copy((char*) &byte, 1);
+                }
+                break;
+            }
+            case LLAMA_VOCAB_TYPE_BPE: {
+                // NOTE: we accept all unsupported token types,
+                // suppressing them like CONTROL tokens.
+                if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
+                    return _try_copy(token_text.data(), token_text.size());
+                } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+                    std::string result = llama_decode_text(token_text);
+                    return _try_copy(result.data(), result.size());
+                }
+                break;
+            }
+            default:
+                GGML_ASSERT(false);
+        }
+    }
+
+    return 0;
+}
+
+int32_t llama_detokenize_impl(
+        const struct llama_vocab & vocab,
+               const llama_token * tokens,
+                         int32_t   n_tokens,
+                            char * text,
+                         int32_t   text_len_max,
+                            bool   remove_special,
+                            bool   unparse_special) {
+    int32_t avail = text_len_max;
+    int32_t total = 0;
+
+    // remove the leading space
+    bool remove_space = vocab.tokenizer_add_space_prefix;
+
+    if (remove_special && vocab.tokenizer_add_bos) {
+        if (n_tokens > 0 && tokens[0] == vocab.special_bos_id) {
+            remove_space = false;
+            n_tokens--;
+            tokens++;
+        }
+    }
+
+    if (remove_special && vocab.tokenizer_add_eos) {
+        if (n_tokens > 0 && tokens[n_tokens-1] == vocab.special_eos_id) {
+            n_tokens--;
+        }
+    }
+
+    for (int32_t i = 0; i < n_tokens; ++i) {
+        GGML_ASSERT(avail >= 0);
+        int32_t n_chars = llama_token_to_piece_impl(vocab, tokens[i], text, avail, remove_space, unparse_special);
+        remove_space = false;
+        if (n_chars < 0) {
+            avail = 0;
+            total -= n_chars;
+        } else if (n_chars > 0) {
+            avail -= n_chars;
+            text  += n_chars;
+            total += n_chars;
+        }
+    }
+
+    if (total > text_len_max) {
+        return -total;
+    }
+
+    if (vocab.tokenizer_clean_spaces) {
+        text -= total;  // restart text
+
+        // first pass: characters ?!.,  //TODO: where do these characters come from?
+        const int32_t total1 = total;
+        total = total ? 1 : 0;
+        for (int32_t i = 1; i < total1; ++i) {
+            const char x = text[i];
+            if (text[i - 1] == ' ') {
+                if (x == '?' || x == '!' || x == '.' || x == ',') {  // " ?", " !", " .", " ,"
+                    total--;  // remove space
+                }
+            }
+            text[total++] = x;
+        }
+
+        // second pass: strip single apostrophe between spaces
+        const int32_t total2 = total;
+        total = total ? 1 : 0;
+        for (int32_t i = 1; i < total2; ++i) {
+            const char x = text[i];
+            if (x == '\'' && i + 1 < total2 && text[i - 1] == ' ' && text[i + 1] == ' ') {  // " ' "
+                total--;           // remove prev space
+                text[++i] = '\0';  // remove next space
+            }
+            text[total++] = x;
+        }
+
+        // third pass: apostrophe contractions  //NOTE: this makes sense?
+        const int32_t total3 = total;
+        total = total ? 1 : 0;
+        for (int32_t i = 1; i < total3; ++i) {
+            const char x = text[i];
+            if (text[i - 1] == ' ') {
+                if (x == '\'' && i + 1 < total3) {
+                    const char x1 = text[i + 1];
+                    if (x1 == 't' || x1 == 'd') {  // " 't", " 'd"
+                        //total--;  // remove space
+                    } else if (x1 == 's' || x1 == 'm') {  // " 's", " 'm"
+                        total--;  // remove space
+                    } else if (i + 2 < total3) {
+                        const char x2 = text[i + 2];
+                        if ((x1 == 'l' && x2 == 'l')) {  // " 'll"
+                            //total--;  // remove space
+                        } else if ((x1 == 'r' && x2 == 'e') || (x1 == 'v' && x2 == 'e')) {  // " 're", " 've"
+                            total--;  // remove space
+                        } else {
+                            //total--;  // remove space
+                        }
+                    } else {
+                        //total--;  // remove space
+                    }
+                }
+            }
+            text[total++] = x;
+        }
+    }
+
+    return total <= text_len_max ? total : -total;
+}
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
new file mode 100644
index 000000000..30b565d55
--- /dev/null
+++ b/src/llama-vocab.h
@@ -0,0 +1,130 @@
+#pragma once
+
+#include "llama-impl.h"
+
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <map>
+
+struct llama_vocab {
+    using id    = llama_token;
+    using token = std::string;
+    using tattr = llama_token_attr;
+
+    struct token_data {
+        token text;
+        float score;
+        tattr attr;
+    };
+
+    enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
+    enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+
+    int max_token_len = 0; // used for optimizing longest token search
+
+    std::unordered_map<token, id> token_to_id;
+    std::vector<token_data>       id_to_token;
+
+    std::vector<id>    cache_special_tokens;
+    std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
+
+    std::map<std::pair<std::string, std::string>, int> bpe_ranks;
+
+    // default LLaMA special tokens
+    id special_bos_id  = 1;
+    id special_eos_id  = 2;
+    id special_unk_id  = 0;
+    id special_sep_id  = -1;
+    id special_pad_id  = -1;
+    id special_cls_id  = -1;
+    id special_mask_id = -1;
+
+    id linefeed_id       = 13;
+    id special_prefix_id = -1;
+    id special_suffix_id = -1;
+    id special_middle_id = -1;
+    id special_eot_id    = -1; // TODO: move above after "eos_id", and here add "file separator" token
+
+    // tokenizer flags
+    bool tokenizer_add_space_prefix = false;
+    bool tokenizer_add_bos          = false;
+    bool tokenizer_add_eos          = false;
+    bool tokenizer_ignore_merges    = false;
+    bool tokenizer_clean_spaces     = false;  // clean_up_tokenization_spaces
+    bool tokenizer_remove_extra_whitespaces   = false;
+    bool tokenizer_escape_whitespaces         = true;
+    bool tokenizer_treat_whitespace_as_suffix = false;
+
+    std::vector<char> precompiled_charsmap;
+
+    int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
+};
+
+const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx);
+
+//
+// internal API
+//
+
+// TODO: rename to llama_tokenize_impl
+// TODO: This should probably be in llama.h
+std::vector<llama_vocab::id> llama_tokenize_internal(
+        const llama_vocab & vocab,
+        std::string raw_text,
+        bool add_special,
+        bool parse_special = false);
+
+llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch);
+
+const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token);
+
+float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token);
+
+llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token);
+
+bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token);
+
+bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token);
+
+llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
+llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
+llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
+llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
+llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
+llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
+
+int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab);
+int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab);
+
+llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
+llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
+llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
+llama_token llama_token_eot_impl   (const struct llama_vocab & vocab);
+
+int32_t llama_tokenize_impl(
+        const struct llama_vocab & vocab,
+                      const char * text,
+                         int32_t   text_len,
+                     llama_token * tokens,
+                         int32_t   n_tokens_max,
+                            bool   add_special,
+                            bool   parse_special);
+
+// does not write null-terminator to buf
+int32_t llama_token_to_piece_impl(
+        const struct llama_vocab & vocab,
+                     llama_token   token,
+                            char * buf,
+                         int32_t   length,
+                         int32_t   lstrip,
+                            bool   special);
+
+int32_t llama_detokenize_impl(
+        const struct llama_vocab & vocab,
+               const llama_token * tokens,
+                         int32_t   n_tokens,
+                            char * text,
+                         int32_t   text_len_max,
+                            bool   remove_special,
+                            bool   unparse_special);
diff --git a/src/llama.cpp b/src/llama.cpp
index 7d68ed811..bc0183741 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1,5 +1,7 @@
-#define LLAMA_API_INTERNAL
-#include "llama.h"
+#include "llama-impl.h"
+#include "llama-vocab.h"
+#include "llama-grammar.h"
+#include "llama-sampling.h"
 
 #include "unicode.h"
 
@@ -79,7 +81,6 @@
 #include <cstdio>
 #include <cstring>
 #include <ctime>
-#include <forward_list>
 #include <fstream>
 #include <functional>
 #include <future>
@@ -89,9 +90,6 @@
 #include <memory>
 #include <mutex>
 #include <numeric>
-#include <queue>
-#include <random>
-#include <regex>
 #include <set>
 #include <sstream>
 #include <thread>
@@ -102,41 +100,26 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-#ifdef __GNUC__
-#ifdef __MINGW32__
-#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
-#else
-#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
-#endif
-#else
-#define LLAMA_ATTRIBUTE_FORMAT(...)
-#endif
-
 // bump if necessary
 #define LLAMA_MAX_NODES   8192
 #define LLAMA_MAX_LAYERS  512
 #define LLAMA_MAX_EXPERTS 160  // DeepSeekV2
 
-//
-// logging
-//
-
-LLAMA_ATTRIBUTE_FORMAT(2, 3)
-static void llama_log_internal        (ggml_log_level level, const char * format, ...);
-static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
-
-#define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
-#define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
-#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
-
 //
 // helpers
 //
 
-static size_t utf8_len(char src) {
-    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
-    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
-    return lookup[highbits];
+// trim whitespace from the beginning and end of a string
+static std::string trim(const std::string & str) {
+    size_t start = 0;
+    size_t end = str.size();
+    while (start < end && isspace(str[start])) {
+        start += 1;
+    }
+    while (end > start && isspace(str[end - 1])) {
+        end -= 1;
+    }
+    return str.substr(start, end - start);
 }
 
 static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
@@ -2583,72 +2566,6 @@ struct llama_control_vector {
     }
 };
 
-struct llama_vocab {
-    using id    = int32_t;
-    using token = std::string;
-    using tattr = llama_token_attr;
-
-    struct token_data {
-        token text;
-        float score;
-        tattr attr;
-    };
-
-    enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
-    enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-
-    int max_token_len = 0; // used for optimizing longest token search
-
-    std::unordered_map<token, id> token_to_id;
-    std::vector<token_data>       id_to_token;
-
-    std::vector<id>    cache_special_tokens;
-    std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
-
-    std::map<std::pair<std::string, std::string>, int> bpe_ranks;
-
-    // default LLaMA special tokens
-    id special_bos_id  = 1;
-    id special_eos_id  = 2;
-    id special_unk_id  = 0;
-    id special_sep_id  = -1;
-    id special_pad_id  = -1;
-    id special_cls_id  = -1;
-    id special_mask_id = -1;
-
-    id linefeed_id       = 13;
-    id special_prefix_id = -1;
-    id special_suffix_id = -1;
-    id special_middle_id = -1;
-    id special_eot_id    = -1; // TODO: move above after "eos_id", and here add "file separator" token
-
-    // tokenizer flags
-    bool tokenizer_add_space_prefix = false;
-    bool tokenizer_add_bos          = false;
-    bool tokenizer_add_eos          = false;
-    bool tokenizer_ignore_merges    = false;
-    bool tokenizer_clean_spaces     = false;  // clean_up_tokenization_spaces
-    bool tokenizer_remove_extra_whitespaces   = false;
-    bool tokenizer_escape_whitespaces         = true;
-    bool tokenizer_treat_whitespace_as_suffix = false;
-
-    std::vector<char> precompiled_charsmap;
-
-    int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
-        GGML_ASSERT(token_left.find(' ') == std::string::npos);
-        GGML_ASSERT(token_left.find('\n') == std::string::npos);
-        GGML_ASSERT(token_right.find(' ') == std::string::npos);
-        GGML_ASSERT(token_right.find('\n') == std::string::npos);
-
-        auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
-        if (it == bpe_ranks.end()) {
-            return -1;
-        }
-
-        return it->second;
-    }
-};
-
 struct llama_model {
     e_model     type  = MODEL_UNKNOWN;
     llm_arch    arch  = LLM_ARCH_UNKNOWN;
@@ -2737,7 +2654,12 @@ struct llama_model {
 };
 
 struct llama_context {
-    llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
+    llama_context(const llama_model & model)
+        : model(model)
+        , sampling(llama_n_vocab(&model))
+        , t_start_us(model.t_start_us)
+        , t_load_us(model.t_load_us) {}
+
     ~llama_context() {
         ggml_backend_sched_free(sched);
 
@@ -2748,7 +2670,14 @@ struct llama_context {
         ggml_backend_buffer_free(buf_output);
     }
 
-    llama_cparams cparams;
+    const struct llama_model & model;
+
+    struct llama_cparams        cparams;
+    struct llama_sampling       sampling;
+    struct llama_kv_cache       kv_self;
+    struct llama_control_vector cvec;
+
+    std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
 
     std::vector<ggml_backend_t> backends;
 #ifdef GGML_USE_METAL
@@ -2759,26 +2688,16 @@ struct llama_context {
 #endif
     ggml_backend_t backend_cpu = nullptr;
 
-
-    const llama_model & model;
-
-    // key + value cache for the self attention
-    struct llama_kv_cache kv_self;
-
-    std::mt19937 rng;
-
     bool has_evaluated_once = false;
 
     int64_t t_start_us;
     int64_t t_load_us;
-    int64_t t_sample_us = 0;
     int64_t t_p_eval_us = 0;
     int64_t t_eval_us   = 0;
 
     int64_t t_compute_start_us = 0;
     int64_t n_queued_tokens = 0;
 
-    int32_t n_sample = 0; // number of tokens sampled
     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
     int32_t n_eval   = 0; // number of eval calls
 
@@ -2834,12 +2753,6 @@ struct llama_context {
     struct ggml_tensor * inp_pos_bucket;    // I32 [n_batch|n_kv, n_batch]
     struct ggml_tensor * inp_embd_enc;      // F32 [n_embd, n_outputs_enc]
     struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
-
-    // control vectors
-    struct llama_control_vector cvec;
-
-    // lora adapters and scales
-    std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
 };
 
 struct llama_lora_weight {
@@ -3707,7 +3620,7 @@ struct llama_model_loader {
         }
 
         if (param_overrides_p != nullptr) {
-            for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
+            for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) {
                 kv_overrides.insert({std::string(p->key), *p});
             }
         }
@@ -3875,7 +3788,7 @@ struct llama_model_loader {
             ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
 
             {
-                const int kid = gguf_find_key(meta, "general.file_type");
+                const int kid = gguf_find_key(meta, "general.file_type"); // TODO: use LLM_KV
                 if (kid >= 0) {
                     ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
                 }
@@ -4974,6 +4887,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_PHI3:
             {
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
                 switch (hparams.n_layer) {
@@ -5007,7 +4921,7 @@ static void llm_load_hparams(
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 switch (hparams.n_layer) {
-                    case 42: model.type = e_model::MODEL_SMALL; break;
+                    case 42: model.type = e_model::MODEL_7B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
@@ -5302,12 +5216,6 @@ static void llm_load_hparams(
     hparams.rope_type = llama_rope_type(&model);
 }
 
-// TODO: This should probably be in llama.h
-static std::vector<llama_vocab::id> llama_tokenize_internal(
-    const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special = false
-);
-static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
-
 static void llm_load_vocab(
         llama_model_loader & ml,
         llama_model & model) {
@@ -5369,6 +5277,7 @@ static void llm_load_vocab(
             if (merges_keyidx == -1) {
                 throw std::runtime_error("cannot find tokenizer merges in model file\n");
             }
+
             const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
             for (int i = 0; i < n_merges; i++) {
                 const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
@@ -5407,16 +5316,6 @@ static void llm_load_vocab(
             vocab.special_cls_id  = -1;
             vocab.special_mask_id = -1;
 
-            const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
-            if (add_space_prefix_keyidx != -1) {
-                vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
-            } // The default value of add_space_prefix is true.
-
-            const int remove_extra_whitespaces_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS).c_str());
-            if (remove_extra_whitespaces_keyidx != -1) {
-                vocab.tokenizer_remove_extra_whitespaces = gguf_get_val_bool(ctx, remove_extra_whitespaces_keyidx);
-            } // The default value of remove_extra_whitespaces is false.
-
             const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
             if (precompiled_charsmap_keyidx != -1) {
                 size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
@@ -5524,6 +5423,19 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "jais") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
+            } else if (
+                tokenizer_pre == "tekken") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
+                vocab.tokenizer_clean_spaces = false;
+                vocab.tokenizer_ignore_merges = true;
+                vocab.tokenizer_add_bos = true;
+            } else if (
+                tokenizer_pre == "smollm") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
+                vocab.tokenizer_clean_spaces = false;
+            } else if (
+                tokenizer_pre == "codeshell") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -5547,10 +5459,8 @@ static void llm_load_vocab(
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         }
 
-        const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
-        if (add_space_prefix_keyidx != -1) {
-            vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
-        }
+        ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX,      vocab.tokenizer_add_space_prefix,         false);
+        ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.tokenizer_remove_extra_whitespaces, false);
     }
 
     const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
@@ -5642,7 +5552,7 @@ static void llm_load_vocab(
             }
         }
         try {
-            vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+            vocab.linefeed_id = llama_byte_to_token_impl(vocab, '\n');
         } catch (const std::exception & e) {
             LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
             vocab.linefeed_id = vocab.special_pad_id;
@@ -6131,10 +6041,10 @@ static bool llm_load_tensors(
 
                         layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
 
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head});
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
 
                         // optional bias tensors
                         layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     llama_model_loader::TENSOR_NOT_REQUIRED);
@@ -10837,7 +10747,7 @@ struct llm_build_context {
         struct ggml_tensor * inp_pos = build_inp_pos();
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+        struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
 
         for (int il = 0; il < n_layer; ++il) {
             auto residual = inpL;
@@ -10895,7 +10805,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -14102,18 +14012,23 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         "causal attention is not supported by this model"
     );
 
-    if (lctx.inp_KQ_mask) {
+    if (lctx.inp_KQ_mask || lctx.inp_KQ_mask_swa) {
         // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache.
         if (cparams.causal_attn && !lctx.is_encoding) {
             const int64_t n_kv     = kv_self.n;
             const int64_t n_tokens = batch.n_tokens;
 
-            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
 
-            float * data     = (float *) lctx.inp_KQ_mask->data;
+            float * data     = nullptr;
             float * data_swa = nullptr;
 
+            if (lctx.inp_KQ_mask) {
+                GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
+                data = (float *) lctx.inp_KQ_mask->data;
+            }
+
             if (lctx.inp_KQ_mask_swa) {
+                GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_swa->buffer));
                 data_swa = (float *) lctx.inp_KQ_mask_swa->data;
             }
 
@@ -14131,12 +14046,15 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
                             f = -INFINITY;
                         } else {
                             if (hparams.use_alibi) {
-                                f = -fabs(lctx.kv_self.cells[i].pos - pos);
+                                f = -std::abs(lctx.kv_self.cells[i].pos - pos);
                             } else {
                                 f = 0.0f;
                             }
                         }
-                        data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
+
+                        if (data) {
+                            data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
+                        }
 
                         // may need to cut off old tokens for sliding window
                         if (data_swa) {
@@ -14148,9 +14066,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
                     }
                 }
 
-                for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                    for (int j = 0; j < n_kv; ++j) {
-                        data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
+                if (data) {
+                    for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+                        for (int j = 0; j < n_kv; ++j) {
+                            data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
+                        }
+                    }
+                }
+
+                if (data_swa) {
+                    for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+                        for (int j = 0; j < n_kv; ++j) {
+                            data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
+                        }
                     }
                 }
             }
@@ -14172,7 +14100,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
                         for (int s = 0; s < batch.n_seq_id[i]; ++s) {
                             if (batch.seq_id[i][s] == seq_id) {
                                 if (hparams.use_alibi) {
-                                    f = -fabs(batch.pos[i] - batch.pos[j]);
+                                    f = -std::abs(batch.pos[i] - batch.pos[j]);
                                 } else {
                                     f = 0.0f;
                                 }
@@ -15231,2510 +15159,6 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
     }
 }
 
-//
-// tokenizer
-//
-
-static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
-    return vocab.type;
-}
-
-static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
-    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
-}
-
-static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
-    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
-}
-
-static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
-    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
-}
-
-static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
-    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
-}
-
-static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
-    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
-}
-
-static bool llama_is_unused_token(const llama_vocab& vocab, llama_token id) {
-    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED;
-}
-
-static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
-    GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
-    GGML_ASSERT(llama_is_byte_token(vocab, id));
-    const auto & token_data = vocab.id_to_token.at(id);
-    switch (llama_vocab_get_type(vocab)) {
-        case LLAMA_VOCAB_TYPE_SPM:
-        case LLAMA_VOCAB_TYPE_UGM: {
-            auto buf = token_data.text.substr(3, 2);
-            return strtol(buf.c_str(), NULL, 16);
-        }
-        case LLAMA_VOCAB_TYPE_BPE: {
-            GGML_ASSERT(false);
-            return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
-        }
-        case LLAMA_VOCAB_TYPE_WPM: {
-            GGML_ASSERT(false);
-        }
-        default:
-            GGML_ASSERT(false);
-    }
-}
-
-static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
-    GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
-    static const char * hex = "0123456789ABCDEF";
-    switch (llama_vocab_get_type(vocab)) {
-        case LLAMA_VOCAB_TYPE_SPM:
-        case LLAMA_VOCAB_TYPE_UGM: {
-            const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
-            auto token = vocab.token_to_id.find(buf);
-            if (token != vocab.token_to_id.end()) {
-                return (*token).second;
-            }
-            // Try to fall back to just the byte as a string
-            const char buf2[2] = { (char)ch, 0 };
-            return vocab.token_to_id.at(buf2);
-        }
-        case LLAMA_VOCAB_TYPE_WPM:
-        case LLAMA_VOCAB_TYPE_BPE: {
-            return vocab.token_to_id.at(unicode_byte_to_utf8(ch));
-        }
-        default:
-            GGML_ASSERT(false);
-    }
-}
-
-static void llama_escape_whitespace(std::string & text) {
-    replace_all(text, " ", "\xe2\x96\x81");
-}
-
-static void llama_unescape_whitespace(std::string & word) {
-    replace_all(word, "\xe2\x96\x81", " ");
-}
-
-struct llm_symbol {
-    using index = int;
-    index prev;
-    index next;
-    const char * text;
-    size_t n;
-};
-
-static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not trivially copyable");
-
-// SPM tokenizer
-// original implementation:
-// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
-
-struct llm_bigram_spm {
-    struct comparator {
-        bool operator()(llm_bigram_spm & l, llm_bigram_spm & r) {
-            return (l.score < r.score) || (l.score == r.score && l.left > r.left);
-        }
-    };
-    using queue_storage = std::vector<llm_bigram_spm>;
-    using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;
-    llm_symbol::index left;
-    llm_symbol::index right;
-    float score;
-    size_t size;
-};
-
-struct llm_tokenizer_spm {
-    llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {}
-
-    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
-        // split string into utf8 chars
-        int index = 0;
-        size_t offs = 0;
-        while (offs < text.size()) {
-            llm_symbol sym;
-            size_t len = utf8_len(text[offs]);
-            sym.text = text.c_str() + offs;
-            sym.n = std::min(len, text.size() - offs);
-            offs += sym.n;
-            sym.prev = index - 1;
-            sym.next = offs == text.size() ? -1 : index + 1;
-            index++;
-            symbols.emplace_back(sym);
-        }
-
-        // seed the work queue with all possible 2-character tokens.
-        for (size_t i = 1; i < symbols.size(); ++i) {
-            try_add_bigram(i - 1, i);
-        }
-
-        // keep substituting the highest frequency pairs for as long as we can.
-        while (!work_queue.empty()) {
-            auto bigram = work_queue.top();
-            work_queue.pop();
-
-            auto & left_sym = symbols[bigram.left];
-            auto & right_sym = symbols[bigram.right];
-
-            // if one of the symbols already got merged, skip it.
-            if (left_sym.n == 0 || right_sym.n == 0 ||
-                left_sym.n + right_sym.n != bigram.size) {
-                continue;
-            }
-
-            // merge the right sym into the left one
-            left_sym.n += right_sym.n;
-            right_sym.n = 0;
-
-            //LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
-
-            // remove the right sym from the chain
-            left_sym.next = right_sym.next;
-            if (right_sym.next >= 0) {
-                symbols[right_sym.next].prev = bigram.left;
-            }
-
-            // find more substitutions
-            try_add_bigram(left_sym.prev, bigram.left);
-            try_add_bigram(bigram.left, left_sym.next);
-        }
-
-        for (int i = 0; i != -1; i = symbols[i].next) {
-            auto & symbol = symbols[i];
-            resegment(symbol, output);
-        }
-    }
-
-private:
-    void resegment(llm_symbol & symbol, std::vector<llama_vocab::id> & output) {
-        auto text = std::string(symbol.text, symbol.n);
-        auto token = vocab.token_to_id.find(text);
-
-        // Do we need to support is_unused?
-        if (token != vocab.token_to_id.end()) {
-            output.push_back((*token).second);
-            return;
-        }
-
-        const auto p = rev_merge.find(text);
-
-        if (p == rev_merge.end()) {
-            // output any symbols that did not form tokens as bytes.
-            output.reserve(output.size() + symbol.n);
-            for (int j = 0; j < (int)symbol.n; ++j) {
-                llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]);
-                output.push_back(token_id);
-            }
-            return;
-        }
-
-        resegment(symbols[p->second.first],  output);
-        resegment(symbols[p->second.second], output);
-    }
-
-    void try_add_bigram(int left, int right) {
-        if (left == -1 || right == -1) {
-            return;
-        }
-
-        const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
-        auto token = vocab.token_to_id.find(text);
-
-        if (token == vocab.token_to_id.end()) {
-            return;
-        }
-
-        if (static_cast<size_t>((*token).second) >= vocab.id_to_token.size()) {
-            return;
-        }
-
-        const auto & tok_data = vocab.id_to_token[(*token).second];
-
-        llm_bigram_spm bigram;
-        bigram.left  = left;
-        bigram.right = right;
-        bigram.score = tok_data.score;
-        bigram.size  = text.size();
-
-        work_queue.push(bigram);
-
-        // Do we need to support is_unused?
-        rev_merge[text] = std::make_pair(left, right);
-    }
-
-    const llama_vocab & vocab;
-
-    std::vector<llm_symbol> symbols;
-    llm_bigram_spm::queue work_queue;
-
-    std::map<std::string, std::pair<int, int>> rev_merge;
-};
-
-// BPE tokenizer
-// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
-// tried to simplify unicode stuff, so most likely does not work 100% correctly!
-
-// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused
-
-struct llm_bigram_bpe {
-    struct comparator {
-        bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
-            return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
-        }
-    };
-
-    using queue_storage = std::vector<llm_bigram_bpe>;
-    using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;
-    llm_symbol::index left;
-    llm_symbol::index right;
-    std::string text;
-    int rank;
-    size_t size;
-};
-
-struct llm_tokenizer_bpe {
-    llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {
-        GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE);
-        switch (vocab.type_pre) {
-            case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
-                regex_exprs = {
-                    // original regex from tokenizer.json
-                    //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-
-                    // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
-                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-                };
-                break;
-            case LLAMA_VOCAB_PRE_TYPE_DBRX:
-            case LLAMA_VOCAB_PRE_TYPE_SMAUG:
-                regex_exprs = {
-                    // same as llama3
-                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-                };
-                break;
-            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
-                regex_exprs = {
-                    "[\r\n]",
-                    "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿǄ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿﬀ-ﬆﬓ-ﬗＡ-Ｚａ-ｚ𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
-                    "\\s?[!-/:-~！-／：-～‘-‟　-。]+",
-                    "\\s+$",
-                    "[一-龥ࠀ-一가-퟿]+",
-                    "\\p{N}+",
-                };
-                break;
-            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
-                regex_exprs = {
-                    "[\r\n]",
-                    "\\s?\\p{L}+",
-                    "\\s?\\p{P}+",
-                    "[一-龥ࠀ-一가-퟿]+",
-                    "\\p{N}",
-                };
-                break;
-            case LLAMA_VOCAB_PRE_TYPE_FALCON:
-                regex_exprs = {
-                    "[\\p{P}\\$\\+<=>\\^~\\|`]+",
-                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-                    "[0-9][0-9][0-9]",
-                };
-                break;
-            case LLAMA_VOCAB_PRE_TYPE_STARCODER:
-            case LLAMA_VOCAB_PRE_TYPE_REFACT:
-            case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
-                regex_exprs = {
-                    "\\p{N}",
-                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-                };
-                break;
-            case LLAMA_VOCAB_PRE_TYPE_GPT2:
-            case LLAMA_VOCAB_PRE_TYPE_MPT:
-            case LLAMA_VOCAB_PRE_TYPE_OLMO:
-            case LLAMA_VOCAB_PRE_TYPE_JAIS:
-                regex_exprs = {
-                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-                };
-                break;
-            case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
-            case LLAMA_VOCAB_PRE_TYPE_QWEN2:
-                regex_exprs = {
-                    // original regex from tokenizer.json
-                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
-                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-                };
-                break;
-            case LLAMA_VOCAB_PRE_TYPE_PORO:
-                regex_exprs = {
-                    " ?[^(\\s|.,!?…。，、।۔،)]+",
-                };
-                break;
-            case LLAMA_VOCAB_PRE_TYPE_CHATGLM4:
-                regex_exprs = {
-                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-                };
-                break;
-            case LLAMA_VOCAB_PRE_TYPE_VIKING:
-                regex_exprs = {
-                    " ?[^(\\s|.,!?…。，、।۔،)]+",
-                    "\\p{N}",
-                };
-                break;
-            default:
-                // default regex for BPE tokenization pre-processing
-                regex_exprs = {
-                    "[\\p{P}\\$\\+<=>\\^~\\|]+",
-                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-                    "\\p{N}+",
-                    "[0-9][0-9][0-9]",
-                };
-                break;
-        }
-    }
-
-    void append(const llama_vocab::id token_id, std::vector<llama_vocab::id> & output) const {
-        output.push_back(token_id);
-    }
-
-    bool append_bos(std::vector<llama_vocab::id> & output) const {
-        if (vocab.tokenizer_add_bos) {
-            GGML_ASSERT(vocab.special_bos_id != -1);
-            output.push_back(vocab.special_bos_id);
-            return true;
-        }
-        return false;
-    }
-
-    bool append_eos(std::vector<llama_vocab::id> & output) const {
-        if (vocab.tokenizer_add_eos) {
-            GGML_ASSERT(vocab.special_eos_id != -1);
-            output.push_back(vocab.special_eos_id);
-            return true;
-        }
-        return false;
-    }
-
-    void check_double_bos_eos(const std::vector<llama_vocab::id> & output) const {
-        if (vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
-            LLAMA_LOG_WARN(
-                "%s: Added a BOS token to the prompt as specified by the model but the prompt "
-                "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
-                "Are you sure this is what you want?\n", __FUNCTION__);
-        }
-        if (vocab.tokenizer_add_eos && output.size() >= 2 && *(output.end()-2) == vocab.special_eos_id) {
-            LLAMA_LOG_WARN(
-                "%s: Added a EOS token to the prompt as specified by the model but the prompt "
-                "also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. "
-                "Are you sure this is what you want?\n", __FUNCTION__);
-        }
-    }
-
-    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
-        int final_prev_index = -1;
-
-        const auto word_collection = unicode_regex_split(text, regex_exprs);
-
-        symbols_final.clear();
-
-        for (auto & word : word_collection) {
-            work_queue = llm_bigram_bpe::queue();
-            symbols.clear();
-
-            int index = 0;
-            size_t offset = 0;
-
-            if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
-                symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
-                offset = word.size();
-            }
-
-            while (offset < word.size()) {
-                llm_symbol sym;
-                size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
-                sym.text = word.c_str() + offset;
-                sym.n = char_len;
-                offset += sym.n;
-                sym.prev = index - 1;
-                sym.next = offset == word.size() ? -1 : index + 1;
-                index++;
-                symbols.emplace_back(sym);
-            }
-            for (size_t i = 1; i < symbols.size(); ++i) {
-                add_new_bigram(i - 1, i);
-            }
-
-            // build token(s)
-            while (!work_queue.empty()) {
-                auto bigram = work_queue.top();
-                work_queue.pop();
-
-                auto & left_symbol = symbols[bigram.left];
-                auto & right_symbol = symbols[bigram.right];
-
-                if (left_symbol.n == 0 || right_symbol.n == 0) {
-                    continue;
-                }
-                std::string left_token = std::string(left_symbol.text, left_symbol.n);
-                std::string right_token = std::string(right_symbol.text, right_symbol.n);
-                if (left_token + right_token != bigram.text) {
-                    continue;  // Skip this bigram if it's outdated
-                }
-
-                // merge the right sym into the left one
-                left_symbol.n += right_symbol.n;
-                right_symbol.n = 0;
-
-                // remove the right sym from the chain
-                left_symbol.next = right_symbol.next;
-                if (right_symbol.next >= 0) {
-                    symbols[right_symbol.next].prev = bigram.left;
-                }
-
-                add_new_bigram(left_symbol.prev, bigram.left);  // left side of current symbol
-                add_new_bigram(bigram.left, left_symbol.next);  // right side of current symbol
-            }
-
-            // add the finished tokens to the final list keeping correct order for next and prev
-            for (auto & sym : symbols) {
-                if (sym.n > 0) {
-                    sym.prev = final_prev_index;
-                    sym.next = -1;
-                    if (final_prev_index != -1) {
-                        symbols_final[final_prev_index].next = symbols_final.size();
-                    }
-                    symbols_final.emplace_back(sym);
-                    final_prev_index = symbols_final.size() - 1;
-                }
-            }
-        }
-
-        symbols = symbols_final;
-
-        if (!symbols.empty()) {
-            for (int i = 0; i != -1; i = symbols[i].next) {
-                auto & symbol = symbols[i];
-                if (symbol.n == 0) {
-                    continue;
-                }
-
-                const std::string str = std::string(symbol.text, symbol.n);
-                const auto token = vocab.token_to_id.find(str);
-
-                if (token == vocab.token_to_id.end()) {
-                    for (auto j = str.begin(); j != str.end(); ++j) {
-                        std::string byte_str(1, *j);
-                        auto token_multibyte = vocab.token_to_id.find(byte_str);
-                        if (token_multibyte != vocab.token_to_id.end()) {
-                            output.push_back(token_multibyte->second);
-                        }
-                    }
-                } else {
-                    output.push_back((*token).second);
-                }
-            }
-        }
-    }
-
-private:
-    void add_new_bigram(int left, int right) {
-        if (left == -1 || right == -1) {
-            return;
-        }
-
-        std::string left_token  = std::string(symbols[left].text,  symbols[left].n);
-        std::string right_token = std::string(symbols[right].text, symbols[right].n);
-
-        int rank_found = -1;
-
-        rank_found = vocab.find_bpe_rank(left_token, right_token);
-
-        if (rank_found < 0) {
-            return;
-        }
-
-        llm_bigram_bpe bigram;
-
-        bigram.left  = left;
-        bigram.right = right;
-        bigram.text  = left_token + right_token;
-        bigram.size  = left_token.size() + right_token.size();
-        bigram.rank  = rank_found;
-
-        work_queue.push(bigram);
-    }
-
-    const llama_vocab & vocab;
-
-    std::vector<std::string> regex_exprs;
-
-    std::vector<llm_symbol> symbols;
-    std::vector<llm_symbol> symbols_final;
-
-    llm_bigram_bpe::queue work_queue;
-};
-
-struct llm_tokenizer_wpm {
-    llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
-
-    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) const {
-        const auto & token_map = vocab.token_to_id;
-
-        // normalize and split by whitespace
-        std::vector<std::string> words = preprocess(text);
-
-        // bos token prepended already
-
-        // find the longest tokens that form the words
-        for (const std::string & word : words) {
-            // skip empty words
-            if (word.size() == 0) {
-                continue;
-            }
-
-            // prepend phantom space
-            const std::string word1 = "\xe2\x96\x81" + word;
-            const int n = word1.size();
-
-            const size_t current_tokens = output.size();
-
-            // we're at the start of a new word
-            // move through character position in word
-            for (int i = 0; i < n; ++i) {
-                // loop through possible match length
-                bool match = false;
-                for (int j = std::min(n, i + vocab.max_token_len + 1); j > i; j--) {
-                    auto it = token_map.find(word1.substr(i, j - i));
-                    if (it != token_map.end()) {
-                        output.push_back(it->second);
-                        match = true;
-                        i = j - 1;
-                        break;
-                    }
-                }
-
-                if (!match) { // discard all
-                    output.resize(current_tokens);
-                    break;  // and discard next tokens
-                }
-            }
-
-            // we didn't find any matches for this word
-            if (current_tokens == output.size()) {
-                output.push_back(vocab.special_unk_id);
-            }
-        }
-    }
-
-    // TODO: reduce string copies by using cpts_offs array
-    std::vector<std::string> preprocess(const std::string & text) const {
-        const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
-        std::vector<std::string> words(1, "");
-
-        for (const uint32_t cpt : cpts_nfd) {
-            const auto flags = unicode_cpt_flags(cpt);
-
-            if (flags.is_whitespace) {
-                if (words.back().size()) {  // finish previous word if any
-                    words.emplace_back();
-                }
-                continue;
-            }
-
-            assert (!flags.is_separator);
-            if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
-                continue;
-            }
-
-            const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
-            if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
-                if (words.back().size()) {  // finish previous word if any
-                    words.emplace_back();
-                }
-                words.back() = s;       // single char word
-                words.emplace_back();   // start a new word
-            } else {
-                words.back() += s;  // append char to word
-            }
-        }
-
-        if (!words.back().size()) {
-            words.pop_back();
-        }
-
-        return words;
-    }
-
-    static bool is_chinese_char(uint32_t cpt) {
-        return
-            (cpt >= 0x04E00 && cpt <= 0x09FFF) ||
-            (cpt >= 0x03400 && cpt <= 0x04DBF) ||
-            (cpt >= 0x20000 && cpt <= 0x2A6DF) ||
-            (cpt >= 0x2A700 && cpt <= 0x2B73F) ||
-            (cpt >= 0x2B740 && cpt <= 0x2B81F) ||
-            (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
-            (cpt >= 0x0F900 && cpt <= 0x0FAFF) ||
-            (cpt >= 0x2F800 && cpt <= 0x2FA1F);
-            //(cpt >= 0x3000  && cpt <= 0x303F)  ||
-            //(cpt >= 0xFF00  && cpt <= 0xFFEF);
-    }
-
-    const llama_vocab & vocab;
-};
-
-struct naive_trie {
-    naive_trie() : has_value(false), value(0) {
-    }
-    void insert(const char * key, size_t len, int32_t value = 0) {
-        if (len == 0) {
-            this->has_value = true;
-            this->value = value;
-            return;
-        }
-        char c = key[0];
-        auto res = children.find(c);
-        if (res != children.end()) {
-            res->second.insert(key + 1, len - 1, value);
-        } else {
-            auto res = children.insert(std::make_pair(c, naive_trie()));
-            res.first->second.insert(key + 1, len - 1, value);
-        }
-    }
-    std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) {
-        if (len == 0 || offset == len) {
-            return std::make_pair(key, offset);
-        }
-        char c = key[offset];
-        auto res = children.find(c);
-        if (res != children.end()) {
-            return res->second.get_longest_prefix(key, len, offset + 1);
-        } else {
-            return std::make_pair(key, offset);
-        }
-    }
-    struct naive_trie * traverse(const char c) {
-        auto res = children.find(c);
-        if (res != children.end()) {
-            return &res->second;
-        } else {
-            return NULL;
-        }
-    }
-    std::map<char, struct naive_trie> children;
-    bool has_value;
-    llama_token value;
-};
-
-struct llm_tokenizer_ugm {
-    llm_tokenizer_ugm(const llama_vocab & vocab) : vocab(vocab) {
-        if (vocab.precompiled_charsmap.size() > 0) {
-            size_t charsmap_offset = 0;
-
-            // First four bytes of precompiled_charsmap contains length of binary
-            // blob containing XOR-compressed compact double array (XCDA) entries
-            uint32_t xcda_blob_size = *(const uint32_t *) &vocab.precompiled_charsmap[0];
-            charsmap_offset += sizeof(xcda_blob_size);
-            if (xcda_blob_size + charsmap_offset >= vocab.precompiled_charsmap.size()) {
-                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
-            }
-
-            // Next xcda_blob_size bytes contain entries of XOR-compressed compact
-            // double array (XCDA). Each entry is bit-packed into a 32-bit integer.
-            xcda_array = (const uint32_t *) &vocab.precompiled_charsmap[charsmap_offset];
-            xcda_array_size = xcda_blob_size / sizeof(uint32_t);
-            charsmap_offset += xcda_blob_size;
-
-            // Remaining bytes of precompiled charsmap contain null-terminated
-            // replacement strings for prefixes matched by the XCDA.
-            prefix_replacements = &vocab.precompiled_charsmap[charsmap_offset];
-            prefix_replacements_size = vocab.precompiled_charsmap.size() - charsmap_offset;
-        }
-
-        for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) {
-            const auto &token_data = vocab.id_to_token[id];
-
-            if (llama_is_normal_token(vocab, id)) {
-                min_score = std::min<float>(min_score, token_data.score);
-                max_score = std::max<float>(max_score, token_data.score);
-            }
-
-            if (llama_is_normal_token(vocab, id) ||
-                llama_is_user_defined_token(vocab, id) ||
-                llama_is_unused_token(vocab, id)) {
-                token_matcher.insert(token_data.text.data(), token_data.text.size(), id);
-            }
-
-            if (llama_is_user_defined_token(vocab, id)) {
-                user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size());
-            }
-        }
-
-        unknown_token_score = min_score - unknown_token_score_penalty;
-    }
-
-    /* This implementation is based on SentencePiece optimized Viterbi algorithm for
-     * unigram language models. The general idea is to:
-     * - move along the input sequence in steps of one UTF code point,
-     * - at each step find all possible tokenizations of the prefix by
-     *   traversing the tokens trie,
-     * - for each tokenization store the best one so far (by higher score)
-     * - use the position in sequence after given token as an index to store
-     *   results
-     * - if there was no valid tokenization of the current UTF code point
-     *   then use unknown token with additional score penalty
-     * After processing the whole sequence we backtrack from the end to get
-     * the best tokenization.
-    */
-    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
-        // normalize the input first
-        std::string normalized;
-        normalize(text, &normalized);
-        size_t input_len = normalized.size();
-        if (input_len == 0) {
-            return;
-        }
-
-        // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
-        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.special_unk_id, 0, -FLT_MAX});
-        // at the beginning tokenization score is zero
-        tokenization_results[0] = { vocab.special_unk_id, 0, 0 };
-
-        for (size_t input_offset = 0; input_offset < input_len;) {
-            size_t prefix_offset = input_offset;
-            // calculate how many code units are in the currently processed UTF code point
-            size_t n_utf8_code_units = std::min<size_t>(utf8_len(normalized[input_offset]), input_len - input_offset);
-
-            // traverse the token matcher trie to find a matching token
-            bool single_codepoint_token_found = false;
-            const struct best_tokenization & current_best = tokenization_results[input_offset];
-            struct naive_trie * node  = token_matcher.traverse(normalized[prefix_offset++]);
-
-            while (prefix_offset <= input_len && node != NULL) {
-                // check if we found valid token in prefix
-                if (node->has_value) {
-                    // check if it corresponds to the whole UTF code point
-                    if (prefix_offset - input_offset == n_utf8_code_units) {
-                        single_codepoint_token_found = true;
-                    }
-                    llama_token token_id = node->value;
-                    const auto & token_data = vocab.id_to_token[token_id];
-
-                    // we set the user-defined token scores to 0 to make them more likely to be selected
-                    // (normal token scores are log probabilities, so they are negative)
-                    // score type is double here to make tokenization results exactly
-                    // the same as in the HF tokenizer using SentencePiece
-                    const double token_score = llama_is_user_defined_token(vocab, token_id) ? 0.0 : token_data.score;
-                    const double challenger_score = current_best.score_sum + token_score;
-                    struct best_tokenization & current_champ = tokenization_results[prefix_offset];
-                    if (challenger_score > current_champ.score_sum) {
-                        struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score };
-                        current_champ = challenger;
-                    }
-                }
-                node = node->traverse(normalized[prefix_offset++]);
-            }
-
-            // if we didn't find a valid token corresponding to the whole UTF code point
-            // then use unknown token as the tokenization of this UTF code point
-            if (!single_codepoint_token_found) {
-                const double challenger_score = current_best.score_sum + unknown_token_score;
-                prefix_offset = input_offset + n_utf8_code_units;
-                struct best_tokenization & current_champ = tokenization_results[prefix_offset];
-                if (challenger_score > current_champ.score_sum) {
-                    struct best_tokenization challenger = { vocab.special_unk_id, input_offset, (float) challenger_score };
-                    current_champ = challenger;
-                }
-            }
-
-            // move to the next UTF code point
-            input_offset += n_utf8_code_units;
-        }
-
-        // now backtrack from the end to gather token ids of the best tokenization
-        // merge sequences of consecutive unknown tokens into single unknown tokens
-        bool is_prev_unknown = false;
-        for (struct best_tokenization & tokenization = tokenization_results[input_len]; ; tokenization = tokenization_results[tokenization.input_offset]) {
-            bool is_unknown = tokenization.token_id == vocab.special_unk_id;
-            if (!(is_prev_unknown && is_unknown)) {
-                output.push_back(tokenization.token_id);
-            }
-            if (tokenization.input_offset == 0) {
-                break;
-            }
-            is_prev_unknown = is_unknown;
-        }
-
-        // reverse the output since we added tokens starting from the end of the input
-        std::reverse(output.begin(), output.end());
-    }
-
-private:
-    const llama_vocab & vocab;
-
-    // helper structure for returning normalization results
-    struct normalization_result {
-        const char * normalized;
-        size_t normalized_len;
-        size_t consumed_input;
-    };
-
-    void normalize(const std::string& input, std::string * normalized) {
-        normalized->clear();
-        normalized->reserve(input.size() * 3);
-
-        const std::string space = vocab.tokenizer_escape_whitespaces ? escaped_space : " ";
-
-        bool shall_prepend_space = !vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
-        bool shall_append_space = vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
-        bool shall_merge_spaces = vocab.tokenizer_remove_extra_whitespaces;
-
-        bool is_space_prepended = false;
-        bool processing_non_ws = false;
-
-        size_t input_len = input.size();
-
-        for (size_t input_offset = 0; input_offset < input_len; ) {
-            auto norm_res = normalize_prefix(input, input_offset);
-            for (size_t i = 0; i < norm_res.normalized_len; i++) {
-                char c = norm_res.normalized[i];
-                if (c != ' ') {
-                    if (!processing_non_ws) {
-                        processing_non_ws = true;
-                        if ((shall_prepend_space && !is_space_prepended) || shall_merge_spaces) {
-                            normalized->append(space);
-                            is_space_prepended = true;
-                        }
-                    }
-                    normalized->push_back(c);
-                } else {
-                    if (processing_non_ws) {
-                        processing_non_ws = false;
-                    }
-                    if (!shall_merge_spaces) {
-                        normalized->append(space);
-                    }
-                }
-            }
-
-            input_offset += norm_res.consumed_input;
-        }
-
-        if (shall_append_space) {
-            normalized->append(space);
-        }
-    }
-
-    /*
-     * This structure is a view wrapper for XOR-compressed double array (XCDA)
-     * See Shunsuke Kanda (2018). Space- and Time-Efficient String Dictionaries.
-     * Eeach bit-packed entry contains:
-     * - BASE array value in bits 10-30
-     * - LCHECK array value in bits 0-7
-     * - LEAF array value in bit 9
-     * Entries containing indexes of replacement sequences have set bit 31
-     */
-    struct xcda_array_view {
-    public:
-        xcda_array_view(const uint32_t * xcda_array, size_t xcda_array_size) : xcda_array(xcda_array), xcda_array_size(xcda_array_size) {
-        }
-        uint32_t get_base(size_t index) {
-            uint32_t packed_node = get_node(index);
-            return (packed_node >> 10) << ((packed_node & (1U << 9)) >> 6);
-        }
-        uint32_t get_lcheck(size_t index) {
-            uint32_t packed_node = get_node(index);
-            return packed_node & ((1U << 31) | 0xff);
-        }
-        bool get_leaf(size_t index) {
-            uint32_t packed_node = get_node(index);
-            return (packed_node >> 8) & 1;
-        }
-        uint32_t get_value(size_t index) {
-            uint32_t packed_node = get_node(index);
-            return packed_node & ((1U << 31) - 1);
-        }
-    private:
-        uint32_t get_node(size_t index) {
-            if (index > xcda_array_size) {
-                throw std::runtime_error("Index out of array bounds in XCDA array!");
-            }
-            return xcda_array[index];
-        }
-        const uint32_t * xcda_array;
-        size_t xcda_array_size;
-    };
-
-    struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
-        if (input_offset == input.size()) {
-            return { &input[input_offset], 0, 0 };
-        }
-
-        // if input prefix matches some user-defined token return this token as normalization result
-        auto user_defined_token_match = user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
-        if (user_defined_token_match.second > 0) {
-            return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
-        }
-
-        size_t longest_prefix_length = 0;
-        size_t longest_prefix_offset = 0;
-
-        if (xcda_array_size > 0) {
-            struct xcda_array_view xcda_view(xcda_array, xcda_array_size);
-
-            // Find the longest normalized sequence matching the input prefix by walking
-            // the XOR-compressed compact double array (XCDA) starting from the root node
-            // We find the index of the next node by calculating BASE[s] ^ c where s is
-            // the index of the previous node and c is a numerical character value
-            uint32_t node_index = 0;
-            // get BASE of the root node
-            node_index = xcda_view.get_base(node_index);
-            for (size_t prefix_offset = input_offset; prefix_offset < input.size(); prefix_offset++) {
-                unsigned char c = input[prefix_offset];
-                if (c == 0) {
-                    break;
-                }
-                node_index ^= c;
-                // if value of LCHECK is not c it means that this is not a child of
-                // the previous node, so we stop matching
-                if (xcda_view.get_lcheck(node_index) != c) {
-                    break;
-                }
-                bool is_leaf = xcda_view.get_leaf(node_index);
-                // get BASE of the current node
-                node_index ^= xcda_view.get_base(node_index);
-                // if LEAF of the current node is true, it means that its BASE points to the node
-                // containing index of replacement sequence for currently matched input prefix
-                if (is_leaf)
-                {
-                    longest_prefix_length = prefix_offset - input_offset + 1;
-                    // get index of replacement sequence for currently matched input prefix
-                    longest_prefix_offset = xcda_view.get_value(node_index);
-                }
-            }
-        }
-
-        if (longest_prefix_length > 0) {
-            // we have a match, so return the replacement sequence
-            if (longest_prefix_offset >= prefix_replacements_size) {
-                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
-            }
-            const char * prefix_replacement = &prefix_replacements[longest_prefix_offset];
-            return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
-        } else {
-            // check if the input prefix contains a valid sequence of UTF-8 code units
-            try {
-                // if yes, return this sequence unmodified
-                size_t prefix_offset = input_offset;
-                unicode_cpt_from_utf8(input, prefix_offset);
-                return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset };
-            } catch (std::invalid_argument & /*ex*/) {
-                // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER
-                return { "\xEF\xBF\xBD", 3, 1 };
-            }
-        }
-    }
-
-    // escaped space symbol - U+2581 (Lower One Eighth Block)
-    const std::string escaped_space = "\xE2\x96\x81";
-
-    const char * prefix_replacements = NULL;
-    size_t prefix_replacements_size = 0;
-
-    const uint32_t * xcda_array = NULL;
-    size_t xcda_array_size = 0;
-
-    struct naive_trie user_defined_token_matcher;
-
-    // this structure stores the best tokenization so far at input_offset
-    struct best_tokenization {
-        llama_token token_id;
-        size_t input_offset;
-        float score_sum;
-    };
-
-    float min_score = FLT_MAX;
-    float max_score = -FLT_MAX;
-
-    float unknown_token_score_penalty = 10.0;
-    float unknown_token_score;
-
-    struct naive_trie token_matcher;
-};
-
-
-typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
-    FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
-    FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
-} FRAGMENT_BUFFER_VARIANT_TYPE;
-
-struct fragment_buffer_variant {
-    fragment_buffer_variant(llama_vocab::id _token)
-    :
-        type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
-        token(_token),
-        raw_text(_dummy),
-        offset(0),
-        length(0) {}
-
-    fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
-    :
-        type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
-        token((llama_vocab::id) - 1),
-        raw_text(_raw_text),
-        offset(_offset),
-        length(_length){
-            GGML_ASSERT(_offset >= 0);
-            GGML_ASSERT(_length >= 1);
-            GGML_ASSERT(offset + length <= raw_text.length());
-        }
-
-    const FRAGMENT_BUFFER_VARIANT_TYPE type;
-    const llama_vocab::id token;
-    const std::string _dummy;
-    const std::string & raw_text;
-    const uint64_t offset;
-    const uint64_t length;
-};
-
-// #define PRETOKENIZERDEBUG
-
-static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) {
-    // for each special token
-    for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
-        const auto & data = vocab.id_to_token[special_id];
-        const auto & special_token = data.text;
-
-        if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) {
-            // Ignore control and unknown tokens when parse_special == false
-            continue;
-            // User-defined tokens are still pre-tokenized before everything else
-            // ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
-            // This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
-        }
-
-        // for each text fragment
-        std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
-        while (it != buffer.end()) {
-            auto & fragment = (*it);
-
-            // if a fragment is text ( not yet processed )
-            if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                auto & raw_text = fragment.raw_text;
-
-                auto raw_text_base_offset = fragment.offset;
-                auto raw_text_base_length = fragment.length;
-
-                // loop over the text
-                while (true) {
-                    // find the first occurrence of a given special token in this fragment
-                    //  passing offset argument only limit the "search area" but match coordinates
-                    //  are still relative to the source full raw_text
-                    auto match = raw_text.find(special_token, raw_text_base_offset);
-
-                    // no occurrences found, stop processing this fragment for a given special token
-                    if (match == std::string::npos) break;
-
-                    // check if match is within bounds of offset <-> length
-                    if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break;
-
-#ifdef PRETOKENIZERDEBUG
-                    LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
-#endif
-                    auto source = std::distance(buffer.begin(), it);
-
-                    // if match is further than base offset
-                    //  then we have some text to the left of it
-                    if (match > raw_text_base_offset) {
-                        // left
-                        const int64_t left_reminder_offset = raw_text_base_offset + 0;
-                        int64_t left_reminder_length = match - raw_text_base_offset;
-
-                        if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
-                            while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
-                                left_reminder_length--;
-                            }
-                        }
-
-                        if (left_reminder_length > 0) {
-                            buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
-                            it++;
-                        }
-
-#ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
-#endif
-                    }
-
-                    // special token
-                    buffer.emplace_after(it, special_id);
-                    it++;
-
-                    // right
-                    if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
-                        int64_t right_reminder_offset = match + special_token.length();
-                        int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
-
-                        if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
-                            while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
-                                right_reminder_offset++;
-                                right_reminder_length--;
-                            }
-                        }
-
-                        if (right_reminder_length > 0) {
-                            buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
-                            it++;
-                        }
-
-#ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
-#endif
-
-                        if (source == 0) {
-                            buffer.erase_after(buffer.before_begin());
-                        } else {
-                            buffer.erase_after(std::next(buffer.begin(), (source-1)));
-                        }
-
-                        // repeat for the right side
-                        raw_text_base_offset = right_reminder_offset;
-                        raw_text_base_length = right_reminder_length;
-
-#ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
-#endif
-                    } else {
-                        if (source == 0) {
-                            buffer.erase_after(buffer.before_begin());
-                        } else {
-                            buffer.erase_after(std::next(buffer.begin(), (source-1)));
-                        }
-                        break;
-                    }
-                }
-            }
-            it++;
-        }
-    }
-}
-
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) {
-    std::vector<llama_vocab::id> output;
-    std::forward_list<fragment_buffer_variant> fragment_buffer;
-
-    if (!raw_text.empty()) {
-        fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
-        tokenizer_st_partition(vocab, fragment_buffer, parse_special);
-    }
-
-    switch (vocab.type) {
-        case LLAMA_VOCAB_TYPE_SPM:
-            {
-                // OG tokenizer behavior:
-                //
-                // tokenizer.encode('', add_special_tokens=True)  returns [1]
-                // tokenizer.encode('', add_special_tokens=False) returns []
-
-                bool is_prev_special = true;  // prefix with space if first token
-
-                if (add_special && vocab.tokenizer_add_bos) {
-                    GGML_ASSERT(vocab.special_bos_id != -1);
-                    output.push_back(vocab.special_bos_id);
-                    is_prev_special = true;
-                }
-
-                for (const auto & fragment : fragment_buffer) {
-                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
-
-                        // prefix with space if previous is special
-                        if (vocab.tokenizer_add_space_prefix && is_prev_special) {
-                            raw_text = " " + raw_text;
-                        }
-
-#ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
-#endif
-                        llm_tokenizer_spm tokenizer(vocab);
-                        llama_escape_whitespace(raw_text);
-                        tokenizer.tokenize(raw_text, output);
-                        is_prev_special = false;
-                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
-                        output.push_back(fragment.token);
-                        is_prev_special = true;
-                    }
-                }
-
-                if (add_special && vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
-                    LLAMA_LOG_WARN(
-                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
-                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
-                        "Are you sure this is what you want?\n", __FUNCTION__);
-                }
-
-                if (add_special && vocab.tokenizer_add_eos) {
-                    GGML_ASSERT(vocab.special_eos_id != -1);
-                    output.push_back(vocab.special_eos_id);
-                }
-            } break;
-        case LLAMA_VOCAB_TYPE_BPE:
-            {
-                llm_tokenizer_bpe tokenizer(vocab);
-
-                if (add_special) {
-                    tokenizer.append_bos(output);
-                }
-                for (const auto & fragment : fragment_buffer) {
-                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
-
-#ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
-#endif
-                        tokenizer.tokenize(raw_text, output);
-                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
-                        tokenizer.append(fragment.token, output);
-                    }
-                }
-
-                if (add_special) {
-                    tokenizer.append_eos(output);
-                    tokenizer.check_double_bos_eos(output);
-                }
-            } break;
-        case LLAMA_VOCAB_TYPE_WPM:
-            {
-                if (add_special) {
-                    GGML_ASSERT(vocab.special_cls_id != -1);
-                    output.push_back(vocab.special_cls_id);
-                }
-
-                llm_tokenizer_wpm tokenizer(vocab);
-
-                for (const auto & fragment : fragment_buffer) {
-                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
-
-#ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
-#endif
-                        tokenizer.tokenize(raw_text, output);
-                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
-                        output.push_back(fragment.token);
-                    }
-                }
-
-                if (add_special) {
-                    GGML_ASSERT(vocab.special_sep_id != -1);
-                    output.push_back(vocab.special_sep_id);
-                }
-            } break;
-        case LLAMA_VOCAB_TYPE_UGM:
-            {
-                llm_tokenizer_ugm tokenizer(vocab);
-
-                if (add_special && vocab.tokenizer_add_bos != 0) {
-                    GGML_ASSERT(vocab.special_bos_id != -1);
-                    output.push_back(vocab.special_bos_id);
-                }
-
-                for (const auto & fragment : fragment_buffer) {
-                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
-#ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
-#endif
-                        tokenizer.tokenize(raw_text, output);
-                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
-                        output.push_back(fragment.token);
-                    }
-                }
-
-                if (add_special && vocab.tokenizer_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
-                    LLAMA_LOG_WARN(
-                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
-                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
-                        "Are you sure this is what you want?\n", __FUNCTION__);
-                }
-
-                if (add_special && vocab.tokenizer_add_eos == 1) {
-                    GGML_ASSERT(vocab.special_eos_id != -1);
-                    output.push_back(vocab.special_eos_id);
-                }
-            } break;
-        case LLAMA_VOCAB_TYPE_NONE:
-            GGML_ASSERT(false);
-    }
-
-    return output;
-}
-
-//
-// grammar - internal
-//
-
-
-// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
-// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
-std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
-        const std::string & src,
-        llama_partial_utf8   partial_start) {
-    static const int      lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
-    const char          * pos      = src.c_str();
-    std::vector<uint32_t> code_points;
-    // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
-    code_points.reserve(src.size() + 1);
-    uint32_t              value    = partial_start.value;
-    int                   n_remain = partial_start.n_remain;
-
-    // continue previous decode, if applicable
-    while (*pos != 0 && n_remain > 0) {
-        uint8_t next_byte = static_cast<uint8_t>(*pos);
-        if ((next_byte >> 6) != 2) {
-            // invalid sequence, abort
-            code_points.push_back(0);
-            return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, -1 });
-        }
-        value = (value << 6) + (next_byte & 0x3F);
-        ++pos;
-        --n_remain;
-    }
-
-    if (partial_start.n_remain > 0 && n_remain == 0) {
-        code_points.push_back(value);
-    }
-
-    // decode any subsequent utf-8 sequences, which may end in an incomplete one
-    while (*pos != 0) {
-        uint8_t  first_byte = static_cast<uint8_t>(*pos);
-        uint8_t  highbits   = first_byte >> 4;
-                 n_remain   = lookup[highbits] - 1;
-
-        if (n_remain < 0) {
-            // invalid sequence, abort
-            code_points.clear();
-            code_points.push_back(0);
-            return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, n_remain });
-        }
-
-        uint8_t  mask       = (1 << (7 - n_remain)) - 1;
-                 value      = first_byte & mask;
-        ++pos;
-        while (*pos != 0 && n_remain > 0) {
-            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
-            ++pos;
-            --n_remain;
-        }
-        if (n_remain == 0) {
-            code_points.push_back(value);
-        }
-    }
-    code_points.push_back(0);
-
-    return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
-}
-
-// returns true iff pos points to the end of one of the definitions of a rule
-static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
-    switch (pos->type) {
-        case LLAMA_GRETYPE_END: return true;  // NOLINT
-        case LLAMA_GRETYPE_ALT: return true;  // NOLINT
-        default:                return false;
-    }
-}
-
-// returns true iff chr satisfies the char range at pos (regular or inverse range)
-// asserts that pos is pointing to a char range element
-static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
-        const llama_grammar_element * pos,
-        const uint32_t                chr) {
-
-    bool found            = false;
-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
-
-    GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
-
-    do {
-        if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
-            // inclusive range, e.g. [a-z]
-            found = found || (pos->value <= chr && chr <= pos[1].value);
-            pos += 2;
-        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
-            // Any character matches "."
-            found = true;
-            pos += 1;
-        } else {
-            // exact char match, e.g. [a] or "a"
-            found = found || pos->value == chr;
-            pos += 1;
-        }
-    } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
-
-    return std::make_pair(found == is_positive_char, pos);
-}
-
-// returns true iff some continuation of the given partial UTF-8 sequence could satisfy the char
-// range at pos (regular or inverse range)
-// asserts that pos is pointing to a char range element
-static bool llama_grammar_match_partial_char(
-        const llama_grammar_element * pos,
-        const llama_partial_utf8      partial_utf8) {
-
-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
-    GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
-
-    uint32_t partial_value = partial_utf8.value;
-    int      n_remain      = partial_utf8.n_remain;
-
-    // invalid sequence or 7-bit char split across 2 bytes (overlong)
-    if (n_remain < 0 || (n_remain == 1 && partial_value < 2)) {
-        return false;
-    }
-
-    // range of possible code points this partial UTF-8 sequence could complete to
-    uint32_t low  = partial_value << (n_remain * 6);
-    uint32_t high = low | ((1 << (n_remain * 6)) - 1);
-
-    if (low == 0) {
-        if (n_remain == 2) {
-            low = 1 << 11;
-        } else if (n_remain == 3) {
-            low = 1 << 16;
-        }
-    }
-
-    do {
-        if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
-            // inclusive range, e.g. [a-z]
-            if (pos->value <= high && low <= pos[1].value) {
-                return is_positive_char;
-            }
-            pos += 2;
-        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
-            // Any character matches "."
-            return true;
-        } else {
-            // exact char match, e.g. [a] or "a"
-            if (low <= pos->value && pos->value <= high) {
-                return is_positive_char;
-            }
-            pos += 1;
-        }
-    } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
-
-    return !is_positive_char;
-}
-
-
-// transforms a grammar pushdown stack into N possible stacks, all ending
-// at a character range (terminal element)
-static void llama_grammar_advance_stack(
-        const std::vector<std::vector<llama_grammar_element>>   & rules,
-        const std::vector<const llama_grammar_element *>        & stack,
-        std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
-
-    if (stack.empty()) {
-        if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
-            new_stacks.emplace_back(stack);
-        }
-        return;
-    }
-
-    const llama_grammar_element * pos = stack.back();
-
-    switch (pos->type) {
-        case LLAMA_GRETYPE_RULE_REF: {
-            const size_t                  rule_id = static_cast<size_t>(pos->value);
-            const llama_grammar_element * subpos  = rules[rule_id].data();
-            do {
-                // init new stack without the top (pos)
-                std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
-                if (!llama_grammar_is_end_of_sequence(pos + 1)) {
-                    // if this rule ref is followed by another element, add that to stack
-                    new_stack.push_back(pos + 1);
-                }
-                if (!llama_grammar_is_end_of_sequence(subpos)) {
-                    // if alternate is nonempty, add to stack
-                    new_stack.push_back(subpos);
-                }
-                llama_grammar_advance_stack(rules, new_stack, new_stacks);
-                while (!llama_grammar_is_end_of_sequence(subpos)) {
-                    // scan to end of alternate def
-                    subpos++;
-                }
-                if (subpos->type == LLAMA_GRETYPE_ALT) {
-                    // there's another alternate def of this rule to process
-                    subpos++;
-                } else {
-                    break;
-                }
-            } while (true);
-            break;
-        }
-        case LLAMA_GRETYPE_CHAR:
-        case LLAMA_GRETYPE_CHAR_NOT:
-        case LLAMA_GRETYPE_CHAR_ANY:
-            if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
-                // only add the stack if it's not a duplicate of one we already have
-                new_stacks.emplace_back(stack);
-            }
-            break;
-        default:
-            // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
-            // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
-            // those
-            GGML_ASSERT(false);
-    }
-}
-
-// takes a set of possible pushdown stacks on a grammar, which are required to
-// be positioned at a character range (see `llama_grammar_advance_stack`), and
-// produces the N possible stacks if the given char is accepted at those
-// positions
-void llama_grammar_accept(
-        const std::vector<std::vector<llama_grammar_element>>         & rules,
-        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
-        const uint32_t                                                  chr,
-        std::vector<std::vector<const llama_grammar_element *>>       & new_stacks) {
-
-    new_stacks.clear();
-
-    for (const auto & stack : stacks) {
-        if (stack.empty()) {
-            continue;
-        }
-
-        auto match = llama_grammar_match_char(stack.back(), chr);
-        if (match.first) {
-            const llama_grammar_element * pos = match.second;
-
-            // update top of stack to next element, if any
-            std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
-            if (!llama_grammar_is_end_of_sequence(pos)) {
-                new_stack.push_back(pos);
-            }
-            llama_grammar_advance_stack(rules, new_stack, new_stacks);
-        }
-    }
-}
-
-static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
-        const std::vector<std::vector<llama_grammar_element>>         & rules,
-        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
-        const std::vector<llama_grammar_candidate>                    & candidates);
-
-static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
-        const std::vector<std::vector<llama_grammar_element>> & rules,
-        const std::vector<const llama_grammar_element *>      & stack,
-        const std::vector<llama_grammar_candidate>            & candidates) {
-
-    std::vector<llama_grammar_candidate> rejects;
-    rejects.reserve(candidates.size());
-
-    if (stack.empty()) {
-        for (const auto & tok : candidates) {
-            if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
-                rejects.push_back(tok);
-            }
-        }
-        return rejects;
-    }
-
-    const llama_grammar_element * stack_pos = stack.back();
-
-    std::vector<llama_grammar_candidate> next_candidates;
-    next_candidates.reserve(candidates.size());
-
-    for (const auto & tok : candidates) {
-        if (*tok.code_points == 0) {
-            // reached end of full codepoints in token, reject iff it ended in a partial sequence
-            // that cannot satisfy this position in grammar
-            if (tok.partial_utf8.n_remain != 0 &&
-                    !llama_grammar_match_partial_char(stack_pos, tok.partial_utf8)) {
-                rejects.push_back(tok);
-            }
-        } else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
-            next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
-        } else {
-            rejects.push_back(tok);
-        }
-    }
-
-    const auto * stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
-
-    // update top of stack to next element, if any
-    std::vector<const llama_grammar_element *> stack_after(stack.begin(), stack.end() - 1);
-    if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
-        stack_after.push_back(stack_pos_after);
-    }
-    std::vector<std::vector<const llama_grammar_element *>> next_stacks;
-    llama_grammar_advance_stack(rules, stack_after, next_stacks);
-
-    auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
-    for (const auto & tok : next_rejects) {
-        rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
-    }
-
-    return rejects;
-}
-
-static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
-        const std::vector<std::vector<llama_grammar_element>>         & rules,
-        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
-        const std::vector<llama_grammar_candidate>                    & candidates) {
-    GGML_ASSERT(!stacks.empty()); // REVIEW
-
-    if (candidates.empty()) {
-        return std::vector<llama_grammar_candidate>();
-    }
-
-    auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
-
-    for (size_t i = 1, size = stacks.size(); i < size; ++i) {
-        rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
-    }
-    return rejects;
-}
-
-static bool llama_grammar_detect_left_recursion(
-        const std::vector<std::vector<llama_grammar_element>> & rules,
-        size_t                                                  rule_index,
-        std::vector<bool>                                     * rules_visited,
-        std::vector<bool>                                     * rules_in_progress,
-        std::vector<bool>                                     * rules_may_be_empty) {
-    if ((*rules_in_progress)[rule_index]) {
-        return true;
-    }
-
-    (*rules_in_progress)[rule_index] = true;
-
-    const std::vector<llama_grammar_element> & rule = rules[rule_index];
-
-    // First check if the rule might produce the empty string. This could be done combined with the second
-    // step but it's more readable as two steps.
-    bool at_rule_start = true;
-    for (size_t i = 0; i < rule.size(); i++) {
-        if (llama_grammar_is_end_of_sequence(&rule[i])) {
-            if (at_rule_start) {
-                (*rules_may_be_empty)[rule_index] = true;
-                break;
-            }
-            at_rule_start = true;
-        } else {
-            at_rule_start = false;
-        }
-    }
-
-    // Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
-    // be empty)
-    bool recurse_into_nonterminal = true;
-    for (size_t i = 0; i < rule.size(); i++) {
-        if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
-            if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
-                return true;
-            }
-            if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
-                recurse_into_nonterminal = false;
-            }
-        } else if (llama_grammar_is_end_of_sequence(&rule[i])) {
-            recurse_into_nonterminal = true;
-        } else {
-            recurse_into_nonterminal = false;
-        }
-    }
-
-    (*rules_in_progress)[rule_index] = false;
-    (*rules_visited)[rule_index] = true;
-    return false;
-}
-
-//
-// grammar - external
-//
-
-struct llama_grammar * llama_grammar_init(
-            const llama_grammar_element ** rules,
-                                 size_t    n_rules,
-                                 size_t    start_rule_index) {
-    const llama_grammar_element * pos;
-
-    // copy rule definitions into vectors
-    std::vector<std::vector<llama_grammar_element>> vec_rules(n_rules);
-    for (size_t i = 0; i < n_rules; i++) {
-        for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
-            vec_rules[i].push_back(*pos);
-        }
-        vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
-    }
-
-    // Check for left recursion
-    std::vector<bool> rules_visited(n_rules);
-    std::vector<bool> rules_in_progress(n_rules);
-    std::vector<bool> rules_may_be_empty(n_rules);
-    for (size_t i = 0; i < n_rules; i++) {
-        if (rules_visited[i]) {
-            continue;
-        }
-        if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
-            LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu", i);
-            return nullptr;
-        }
-    }
-
-    // loop over alternates of start rule to build initial stacks
-    std::vector<std::vector<const llama_grammar_element *>> stacks;
-    pos = vec_rules[start_rule_index].data();
-    do {
-        std::vector<const llama_grammar_element *> stack;
-        if (!llama_grammar_is_end_of_sequence(pos)) {
-            // if alternate is nonempty, add to stack
-            stack.push_back(pos);
-        }
-        llama_grammar_advance_stack(vec_rules, stack, stacks);
-        while (!llama_grammar_is_end_of_sequence(pos)) {
-            // scan to end of alternate def
-            pos++;
-        }
-        if (pos->type == LLAMA_GRETYPE_ALT) {
-            // there's another alternate def of this rule to process
-            pos++;
-        } else {
-            break;
-        }
-    } while (true);
-
-    // Important: vec_rules has to be moved here, not copied, because stacks contains
-    // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
-    // then the pointers would be invalidated when the local vec_rules goes out of scope.
-    return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
-}
-
-void llama_grammar_free(struct llama_grammar * grammar) {
-    delete grammar;
-}
-
-struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
-    llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
-
-    // redirect elements in stacks to point to new rules
-    for (size_t is = 0; is < result->stacks.size(); is++) {
-        for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
-            for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
-                for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
-                    if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
-                         result->stacks[is][ie]  =  &result->rules[ir0][ir1];
-                    }
-                }
-            }
-        }
-    }
-
-    return result;
-}
-
-//
-// sampling
-//
-
-void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
-    if (seed == LLAMA_DEFAULT_SEED) {
-        seed = time(NULL);
-    }
-    ctx->rng.seed(seed);
-}
-
-void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
-    GGML_ASSERT(candidates->size > 0);
-
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    // Sort the logits in descending order
-    if (!candidates->sorted) {
-        std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
-            return a.logit > b.logit;
-        });
-        candidates->sorted = true;
-    }
-
-    float max_l = candidates->data[0].logit;
-    float cum_sum = 0.0f;
-    for (size_t i = 0; i < candidates->size; ++i) {
-        float p = expf(candidates->data[i].logit - max_l);
-        candidates->data[i].p = p;
-        cum_sum += p;
-    }
-    for (size_t i = 0; i < candidates->size; ++i) {
-        candidates->data[i].p /= cum_sum;
-    }
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
-    // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
-    // if (k >= (int32_t)candidates->size) {
-    //     return;
-    // }
-
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    if (k <= 0) {
-        k = candidates->size;
-    }
-
-    k = std::max(k, (int) min_keep);
-    k = std::min(k, (int) candidates->size);
-
-    // Sort scores in descending order
-    if (!candidates->sorted) {
-        auto comp = [](const llama_token_data & a, const llama_token_data & b) {
-            return a.logit > b.logit;
-        };
-        if (k <= 128) {
-            std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
-        } else {
-            constexpr int   nbuckets     = 128;
-            constexpr float bucket_low   = -10.0f;
-            constexpr float bucket_high  =  10.0f;
-            constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
-            constexpr float bucker_inter = -bucket_low * bucket_scale;
-
-            std::vector<int> bucket_idx(candidates->size);
-            std::vector<int> histo(nbuckets, 0);
-
-            for (int i = 0; i < (int)candidates->size; ++i) {
-                const float val = candidates->data[i].logit;
-                int ib = int(bucket_scale * val + bucker_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
-                ib = std::max(0, std::min(nbuckets-1, ib));
-                bucket_idx[i] = ib;
-                ++histo[ib];
-            }
-            int nhave = 0;
-            int ib = nbuckets - 1;
-            for ( ; ib >= 0; --ib) {
-                nhave += histo[ib];
-                if (nhave >= k) break;
-            }
-            std::vector<llama_token_data> tmp_tokens(nhave);
-            auto ptr = tmp_tokens.data();
-            std::vector<llama_token_data*> bucket_ptrs;
-            bucket_ptrs.reserve(nbuckets - ib);
-            for (int j = nbuckets - 1; j >= ib; --j) {
-                bucket_ptrs.push_back(ptr);
-                ptr += histo[j];
-            }
-            for (int i = 0; i < (int)candidates->size; ++i) {
-                int j = bucket_idx[i];
-                if (j >= ib) {
-                    *bucket_ptrs[nbuckets-1-j]++ = candidates->data[i];
-                }
-            }
-
-            ptr = tmp_tokens.data();
-            int ndone = 0;
-            for (int j = nbuckets-1; j > ib; --j) {
-                std::sort(ptr, ptr + histo[j], comp);
-                ptr += histo[j];
-                ndone += histo[j];
-            }
-            std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp);
-
-            std::memcpy(candidates->data, tmp_tokens.data(), k*sizeof(llama_token_data));
-
-        }
-        candidates->sorted = true;
-    }
-    candidates->size = k;
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
-    if (p >= 1.0f) {
-        return;
-    }
-
-    llama_sample_softmax(ctx, candidates);
-
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    // Compute the cumulative probabilities
-    float cum_sum = 0.0f;
-    size_t last_idx = candidates->size;
-
-    for (size_t i = 0; i < candidates->size; ++i) {
-        cum_sum += candidates->data[i].p;
-
-        // Check if the running sum is at least p or if we have kept at least min_keep tokens
-        // we set the last index to i+1 to indicate that the current iterate should be included in the set
-        if (cum_sum >= p && i + 1 >= min_keep) {
-            last_idx = i + 1;
-            break;
-        }
-    }
-
-    // Resize the output vector to keep only the top-p tokens
-    candidates->size = last_idx;
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
-    if (p <= 0.0f || !candidates->size) {
-        return;
-    }
-
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    bool min_p_applied = false;
-
-    // if the candidates aren't sorted, try the unsorted implementation first
-    if (!candidates->sorted) {
-        std::vector<llama_token_data> filtered_tokens;
-
-        float max_logit = -FLT_MAX;
-        for (size_t i = 0; i < candidates->size; ++i) {
-            max_logit = std::max(max_logit, candidates->data[i].logit);
-        }
-        const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max
-
-        for (size_t i = 0; i < candidates->size; ++i) {
-            if (candidates->data[i].logit >= min_logit) {
-                filtered_tokens.push_back(candidates->data[i]);
-            }
-        }
-
-        // if we have enough values the operation was a success
-        if (filtered_tokens.size() >= min_keep) {
-            memcpy(candidates->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
-            candidates->size = filtered_tokens.size();
-            min_p_applied = true;
-        }
-    }
-
-    // if the candidates are sorted or the unsorted implementation failed, use this implementation
-    if (!min_p_applied) {
-        // Sort the logits in descending order
-        if (!candidates->sorted) {
-            std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
-                return a.logit > b.logit;
-            });
-            candidates->sorted = true;
-        }
-
-        const float min_logit = candidates->data[0].logit + logf(p); // min logit for p_i >= p * p_max
-        size_t i = 1; // first token always matches
-
-        for (; i < candidates->size; ++i) {
-            if (candidates->data[i].logit < min_logit && i >= min_keep) {
-                break; // prob too small
-            }
-        }
-
-        // Resize the output vector to keep only the matching tokens
-        candidates->size = i;
-    }
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
-    if (z >= 1.0f || candidates->size <= 2) {
-        return;
-    }
-
-    llama_sample_softmax(nullptr, candidates);
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    // Compute the first and second derivatives
-    std::vector<float> first_derivatives(candidates->size - 1);
-    std::vector<float> second_derivatives(candidates->size - 2);
-
-    for (size_t i = 0; i < first_derivatives.size(); ++i) {
-        first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p;
-    }
-    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
-    }
-
-    // Calculate absolute value of second derivatives
-    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        second_derivatives[i] = std::abs(second_derivatives[i]);
-    }
-
-    // Normalize the second derivatives
-    {
-        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
-
-        if (second_derivatives_sum > 1e-6f) {
-            for (float & value : second_derivatives) {
-                value /= second_derivatives_sum;
-            }
-        } else {
-            for (float & value : second_derivatives) {
-                value = 1.0f / second_derivatives.size();
-            }
-        }
-    }
-
-    float cum_sum = 0.0f;
-    size_t last_idx = candidates->size;
-    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        cum_sum += second_derivatives[i];
-
-        // Check if the running sum is greater than z or if we have kept at least min_keep tokens
-        if (cum_sum > z && i >= min_keep) {
-            last_idx = i;
-            break;
-        }
-    }
-
-    // Resize the output vector to keep only the tokens above the tail location
-    candidates->size = last_idx;
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
-    // Reference implementation:
-    // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
-    if (p >= 1.0f) {
-        return;
-    }
-
-    // Compute the softmax of logits and calculate entropy
-    llama_sample_softmax(nullptr, candidates);
-
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    float entropy = 0.0f;
-    for (size_t i = 0; i < candidates->size; ++i) {
-        entropy += -candidates->data[i].p * logf(candidates->data[i].p);
-    }
-
-    // Compute the absolute difference between negative log probability and entropy for each candidate
-    std::vector<float> shifted_scores;
-    for (size_t i = 0; i < candidates->size; ++i) {
-        float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
-        shifted_scores.push_back(shifted_score);
-    }
-
-    // Sort tokens based on the shifted_scores and their corresponding indices
-    std::vector<size_t> indices(candidates->size);
-    std::iota(indices.begin(), indices.end(), 0);
-
-    std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
-        return shifted_scores[a] < shifted_scores[b];
-    });
-
-    // Compute the cumulative probabilities
-    float cum_sum = 0.0f;
-    size_t last_idx = indices.size();
-
-    for (size_t i = 0; i < indices.size(); ++i) {
-        size_t idx = indices[i];
-        cum_sum += candidates->data[idx].p;
-
-        // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
-        if (cum_sum > p && i >= min_keep - 1) {
-            last_idx = i + 1;
-            break;
-        }
-    }
-
-    // Resize the output vector to keep only the locally typical tokens
-    std::vector<llama_token_data> new_candidates;
-    for (size_t i = 0; i < last_idx; ++i) {
-        size_t idx = indices[i];
-        new_candidates.push_back(candidates->data[idx]);
-    }
-
-    // Replace the data in candidates with the new_candidates data
-    std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
-    candidates->size = new_candidates.size();
-    candidates->sorted = false;
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    // no need to do anything if there is only one (or zero) candidates
-    if(candidates_p->size <= 1) {
-        return;
-    }
-
-    // Calculate maximum possible entropy
-    float max_entropy = -logf(1.0f / candidates_p->size);
-
-    llama_sample_softmax(nullptr, candidates_p);
-
-    // Calculate entropy of the softmax probabilities
-    float entropy = 0.0f;
-    for (size_t i = 0; i < candidates_p->size; ++i) {
-        float prob = candidates_p->data[i].p;
-        if (prob > 0.0f) { // Ensure no log(0)
-            entropy -= prob * logf(prob);
-        }
-    }
-
-    // Normalize the entropy (max_entropy cannot be 0 here because we checked candidates_p->size != 1 above)
-    float normalized_entropy = entropy / max_entropy;
-
-    // Map the normalized entropy to the desired temperature range using the power function
-    float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val);
-
-#ifdef DEBUG
-    LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp);
-    LLAMA_LOG_INFO("Entropy: %f\n", entropy);
-    LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy);
-    LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy);
-    LLAMA_LOG_INFO("Exponent: %f\n", exponent_val);
-    LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp);
-#endif
-
-    // Apply the dynamically calculated temperature scaling
-    for (size_t i = 0; i < candidates_p->size; ++i) {
-        candidates_p->data[i].logit /= dyn_temp;
-    }
-
-    // Re-compute softmax probabilities after scaling logits with dynamic temperature
-    double max_l_double = candidates_p->data[0].logit;
-    double cum_sum_double = 0.0;
-    for (size_t i = 0; i < candidates_p->size; ++i) {
-        double p = exp(candidates_p->data[i].logit - max_l_double);
-        candidates_p->data[i].p = p; // Store the scaled probability
-        cum_sum_double += p;
-    }
-    for (size_t i = 0; i < candidates_p->size; ++i) {
-        candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities
-    }
-
-#ifdef DEBUG
-    // Print the updated top 25 probabilities after temperature scaling
-    LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n");
-    for (size_t i = 0; i < 25 && i < candidates_p->size; ++i) {
-        LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates_p->data[i].p * 100.0f);
-    }
-#endif
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    for (size_t i = 0; i < candidates_p->size; ++i) {
-        candidates_p->data[i].logit /= temp;
-    }
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void llama_sample_repetition_penalties(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-               const llama_token * last_tokens,
-                          size_t   penalty_last_n,
-                           float   penalty_repeat,
-                           float   penalty_freq,
-                           float   penalty_present) {
-    if (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)) {
-        return;
-    }
-
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    // Create a frequency map to count occurrences of each token in last_tokens
-    std::unordered_map<llama_token, int> token_count;
-    for (size_t i = 0; i < penalty_last_n; ++i) {
-        token_count[last_tokens[i]]++;
-    }
-
-    // Apply frequency and presence penalties to the candidates
-    for (size_t i = 0; i < candidates->size; ++i) {
-        const auto token_iter = token_count.find(candidates->data[i].id);
-        if (token_iter == token_count.end()) {
-            continue;
-        }
-
-        const int count = token_iter->second;
-
-        // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
-        // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
-        if (candidates->data[i].logit <= 0) {
-            candidates->data[i].logit *= penalty_repeat;
-        } else {
-            candidates->data[i].logit /= penalty_repeat;
-        }
-
-        candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present;
-    }
-
-    candidates->sorted = false;
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
-    GGML_ASSERT(ctx);
-    int64_t t_start_sample_us = ggml_time_us();
-
-    bool allow_eog = false;
-    for (const auto & stack : grammar->stacks) {
-        if (stack.empty()) {
-            allow_eog = true;
-            break;
-        }
-    }
-
-    std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
-    candidates_decoded.reserve(candidates->size);
-
-    std::vector<llama_grammar_candidate> candidates_grammar;
-    candidates_grammar.reserve(candidates->size);
-
-    for (size_t i = 0; i < candidates->size; ++i) {
-        const llama_token id      = candidates->data[i].id;
-        const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(id);
-
-        if (llama_token_is_eog(&ctx->model, id)) {
-            if (!allow_eog) {
-                candidates->data[i].logit = -INFINITY;
-            }
-        } else if (piece.empty() || piece[0] == 0) {
-            candidates->data[i].logit = -INFINITY;
-        } else {
-            candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
-            candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
-        }
-    }
-
-    const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
-    for (const auto & reject : rejects) {
-        candidates->data[reject.index].logit = -INFINITY;
-    }
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-}
-
-static void llama_log_softmax(float * array, size_t size) {
-    float max_l = *std::max_element(array, array + size);
-    float sum = 0.f;
-    for (size_t i = 0; i < size; ++i) {
-        float p = expf(array[i] - max_l);
-        sum += p;
-        array[i] = p;
-    }
-
-    for (size_t i = 0; i < size; ++i) {
-        array[i] = logf(array[i] / sum);
-    }
-}
-
-void llama_sample_apply_guidance(
-          struct llama_context * ctx,
-                         float * logits,
-                         float * logits_guidance,
-                         float   scale) {
-    GGML_ASSERT(ctx);
-
-    const auto t_start_sample_us = ggml_time_us();
-    const auto n_vocab = llama_n_vocab(llama_get_model(ctx));
-
-    llama_log_softmax(logits, n_vocab);
-    llama_log_softmax(logits_guidance, n_vocab);
-
-    for (int i = 0; i < n_vocab; ++i) {
-              auto & l = logits[i];
-        const auto & g = logits_guidance[i];
-
-        l = scale * (l - g) + g;
-    }
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-}
-
-llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
-    GGML_ASSERT(ctx);
-
-    auto N = float(llama_n_vocab(llama_get_model(ctx)));
-    int64_t t_start_sample_us;
-    t_start_sample_us = ggml_time_us();
-
-    llama_sample_softmax(nullptr, candidates);
-
-    // Estimate s_hat using the most probable m tokens
-    float s_hat = 0.0;
-    float sum_ti_bi = 0.0;
-    float sum_ti_sq = 0.0;
-    for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) {
-        float t_i = logf(float(i + 2) / float(i + 1));
-        float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p);
-        sum_ti_bi += t_i * b_i;
-        sum_ti_sq += t_i * t_i;
-    }
-    s_hat = sum_ti_bi / sum_ti_sq;
-
-    // Compute k from the estimated s_hat and target surprise value
-    float epsilon_hat = s_hat - 1;
-    float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
-
-    // Sample the next word X using top-k sampling
-    llama_sample_top_k(nullptr, candidates, int(k), 1);
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    llama_token X = llama_sample_token(ctx, candidates);
-    t_start_sample_us = ggml_time_us();
-
-    // Compute error as the difference between observed surprise and target surprise value
-    size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
-        return candidate.id == X;
-    }));
-    float observed_surprise = -log2f(candidates->data[X_idx].p);
-    float e = observed_surprise - tau;
-
-    // Update mu using the learning rate and error
-    *mu = *mu - eta * e;
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    return X;
-}
-
-llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
-    int64_t t_start_sample_us;
-    t_start_sample_us = ggml_time_us();
-
-    llama_sample_softmax(ctx, candidates);
-
-    // Truncate the words with surprise values greater than mu
-    candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
-        return -log2f(candidate.p) > *mu;
-    }));
-
-    if (candidates->size == 0) {
-        candidates->size = 1;
-    }
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-
-    // Normalize the probabilities of the remaining words
-    llama_sample_softmax(ctx, candidates);
-
-    // Sample the next word X from the remaining words
-    llama_token X = llama_sample_token(ctx, candidates);
-    t_start_sample_us = ggml_time_us();
-
-    // Compute error as the difference between observed surprise and target surprise value
-    size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
-        return candidate.id == X;
-    }));
-    float observed_surprise = -log2f(candidates->data[X_idx].p);
-    float e = observed_surprise - tau;
-
-    // Update mu using the learning rate and error
-    *mu = *mu - eta * e;
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-    return X;
-}
-
-llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    // Find max element
-    auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
-        return a.logit < b.logit;
-    });
-
-    llama_token result = max_iter->id;
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-        ctx->n_sample++;
-    }
-    return result;
-}
-
-llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
-    GGML_ASSERT(ctx);
-
-    const int64_t t_start_sample_us = ggml_time_us();
-    llama_sample_softmax(nullptr, candidates);
-
-    std::vector<float> probs;
-    probs.reserve(candidates->size);
-    for (size_t i = 0; i < candidates->size; ++i) {
-        probs.push_back(candidates->data[i].p);
-    }
-
-    std::discrete_distribution<> dist(probs.begin(), probs.end());
-    int idx = dist(rng);
-
-    llama_token result = candidates->data[idx].id;
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    ctx->n_sample++;
-    return result;
-}
-
-llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
-    return llama_sample_token_with_rng(ctx, candidates, ctx->rng);
-}
-
-void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    if (llama_token_is_eog(&ctx->model, token)) {
-        for (const auto & stack : grammar->stacks) {
-            if (stack.empty()) {
-                return;
-            }
-        }
-        GGML_ASSERT(false);
-    }
-
-    const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(token);
-
-    // Note terminating 0 in decoded string
-    const auto   decoded     = decode_utf8(piece, grammar->partial_utf8);
-    const auto & code_points = decoded.first;
-    std::vector<std::vector<const llama_grammar_element *>> tmp_new_stacks;
-    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-        llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks);
-        grammar->stacks = tmp_new_stacks;
-    }
-    grammar->partial_utf8 = decoded.second;
-    GGML_ASSERT(!grammar->stacks.empty());
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-}
-
 //
 // quantization
 //
@@ -18275,8 +15699,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     // copy the KV pairs from the input file
     gguf_set_kv     (ctx_out, ml.meta);
-    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
-    gguf_set_val_u32(ctx_out, "general.file_type", ftype);
+    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
+    gguf_set_val_u32(ctx_out, "general.file_type", ftype); // TODO: use LLM_KV
+
     // Remove split metadata
     gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
     gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
@@ -18793,6 +16218,10 @@ int32_t llama_lora_adapter_remove(
     return -1;
 }
 
+void llama_lora_adapter_clear(struct llama_context * ctx) {
+    ctx->lora_adapters.clear();
+}
+
 void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
     delete adapter;
 }
@@ -19119,8 +16548,8 @@ struct llama_context * llama_new_context_with_model(
     ctx->abort_callback      = params.abort_callback;
     ctx->abort_callback_data = params.abort_callback_data;
 
-    ctx->rng                 = std::mt19937(params.seed);
-    ctx->logits_all          = params.logits_all;
+    ctx->sampling.rng = std::mt19937(params.seed);
+    ctx->logits_all   = params.logits_all;
 
     uint32_t kv_size = cparams.n_ctx;
     ggml_type type_k = params.type_k;
@@ -19212,9 +16641,7 @@ struct llama_context * llama_new_context_with_model(
             for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
                 ggml_backend_t backend = ggml_backend_sycl_init(i);
                 if (backend == nullptr) {
-                    int id_list[GGML_SYCL_MAX_DEVICES];
-                    ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
-                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
+                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d for No.%d backend\n", __func__, i, i);
                     llama_free(ctx);
                     return nullptr;
                 }
@@ -19396,10 +16823,14 @@ void llama_free(struct llama_context * ctx) {
     delete ctx;
 }
 
-const llama_model * llama_get_model(const struct llama_context * ctx) {
+const struct llama_model * llama_get_model(const struct llama_context * ctx) {
     return &ctx->model;
 }
 
+const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx) {
+    return &ctx->model.vocab;
+}
+
 uint32_t llama_n_ctx(const struct llama_context * ctx) {
     return ctx->cparams.n_ctx;
 }
@@ -19439,7 +16870,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_BAICHUAN:
         case LLM_ARCH_STARCODER:
         case LLM_ARCH_PLAMO:
-        case LLM_ARCH_CODESHELL:
         case LLM_ARCH_ORION:
         case LLM_ARCH_INTERNLM2:
         case LLM_ARCH_MINICPM:
@@ -19469,6 +16899,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_STARCODER2:
         case LLM_ARCH_OPENELM:
         case LLM_ARCH_GPTNEOX:
+        case LLM_ARCH_CODESHELL:
             return LLAMA_ROPE_TYPE_NEOX;
 
         // all model arches should be listed explicitly here
@@ -19988,7 +17419,7 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
     // copy rng
     {
         std::ostringstream rng_ss;
-        rng_ss << ctx->rng;
+        rng_ss << ctx->sampling.rng;
 
         const std::string & rng_str  = rng_ss.str();
         const size_t        rng_size = rng_str.size();
@@ -20154,7 +17585,7 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
         std::string rng_str((const char *)inp, rng_size); inp += rng_size;
 
         std::istringstream rng_ss(rng_str);
-        rng_ss >> ctx->rng;
+        rng_ss >> ctx->sampling.rng;
 
         GGML_ASSERT(!rng_ss.fail());
     }
@@ -21100,80 +18531,82 @@ float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id
     return it->second.data();
 }
 
+//
+// vocab
+//
+
 const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
-    GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return model->vocab.id_to_token[token].text.c_str();
+    return llama_token_get_text_impl(model->vocab, token);
 }
 
 float llama_token_get_score(const struct llama_model * model, llama_token token) {
-    GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return model->vocab.id_to_token[token].score;
+    return llama_token_get_score_impl(model->vocab, token);
 }
 
-llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) {
-    GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return model->vocab.id_to_token[token].attr;
+enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) {
+    return llama_token_get_attr_impl(model->vocab, token);
 }
 
 bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
-    return token != -1 && (
-        token == llama_token_eos(model) ||
-        token == llama_token_eot(model)
-    );
+    return llama_token_is_eog_impl(model->vocab, token);
 }
 
 bool llama_token_is_control(const struct llama_model * model, llama_token token) {
-    return llama_is_control_token(model->vocab, token);
+    return llama_token_is_control_impl(model->vocab, token);
 }
 
 llama_token llama_token_bos(const struct llama_model * model) {
-    return model->vocab.special_bos_id;
+    return llama_token_bos_impl(model->vocab);
 }
 
 llama_token llama_token_eos(const struct llama_model * model) {
-    return model->vocab.special_eos_id;
+    return llama_token_eos_impl(model->vocab);
 }
 
 llama_token llama_token_cls(const struct llama_model * model) {
-    return model->vocab.special_cls_id;
+    return llama_token_cls_impl(model->vocab);
 }
 
 llama_token llama_token_sep(const struct llama_model * model) {
-    return model->vocab.special_sep_id;
+    return llama_token_sep_impl(model->vocab);
 }
 
-llama_token llama_token_nl(const struct llama_model * model) {
-    return model->vocab.linefeed_id;
-}
-
-int32_t llama_add_bos_token(const struct llama_model * model) {
-    return model->vocab.tokenizer_add_bos;
-}
-
-int32_t llama_add_eos_token(const struct llama_model * model) {
-    return model->vocab.tokenizer_add_eos;
-}
-
-llama_token llama_token_prefix(const struct llama_model * model) {
-    return model->vocab.special_prefix_id;
-}
-
-llama_token llama_token_middle(const struct llama_model * model) {
-    return model->vocab.special_middle_id;
-}
-
-llama_token llama_token_suffix(const struct llama_model * model) {
-    return model->vocab.special_suffix_id;
-}
-
-llama_token llama_token_eot(const struct llama_model * model) {
-    return model->vocab.special_eot_id;
+llama_token llama_token_nl (const struct llama_model * model) {
+    return llama_token_nl_impl(model->vocab);
 }
 
 llama_token llama_token_pad(const struct llama_model * model) {
-    return model->vocab.special_pad_id;
+    return llama_token_pad_impl(model->vocab);
 }
 
+int32_t llama_add_bos_token(const struct llama_model * model) {
+    return llama_add_bos_token_impl(model->vocab);
+}
+
+int32_t llama_add_eos_token(const struct llama_model * model) {
+    return llama_add_eos_token_impl(model->vocab);
+}
+
+llama_token llama_token_prefix(const struct llama_model * model) {
+    return llama_token_prefix_impl(model->vocab);
+}
+
+llama_token llama_token_middle(const struct llama_model * model) {
+    return llama_token_middle_impl(model->vocab);
+}
+
+llama_token llama_token_suffix(const struct llama_model * model) {
+    return llama_token_suffix_impl(model->vocab);
+}
+
+llama_token llama_token_eot(const struct llama_model * model) {
+    return llama_token_eot_impl(model->vocab);
+}
+
+//
+// tokenization
+//
+
 int32_t llama_tokenize(
     const struct llama_model * model,
                   const char * text,
@@ -21182,229 +18615,33 @@ int32_t llama_tokenize(
                      int32_t   n_tokens_max,
                         bool   add_special,
                         bool   parse_special) {
-    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_special, parse_special);
-    if (n_tokens_max < (int) res.size()) {
-        // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
-        return -((int) res.size());
-    }
-
-    for (size_t i = 0; i < res.size(); i++) {
-        tokens[i] = res[i];
-    }
-
-    return res.size();
+    return llama_tokenize_impl(model->vocab, text, text_len, tokens, n_tokens_max, add_special, parse_special);
 }
 
-static std::string llama_decode_text(const std::string & text) {
-    std::string decoded_text;
-
-    const auto cpts = unicode_cpts_from_utf8(text);
-    for (const auto cpt : cpts) {
-        const auto utf8 = unicode_cpt_to_utf8(cpt);
-        try {
-            decoded_text += unicode_utf8_to_byte(utf8);
-        } catch (const std::out_of_range & /*e*/) {
-            decoded_text += "[UNK_BYTE_0x";
-            for (const auto c : utf8) {
-                decoded_text += format("%02x", (uint8_t) c);
-            }
-            decoded_text += text + "]";
-        }
-    }
-
-    return decoded_text;
-}
-
-// does not write null-terminator to buf
-int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) {
-    // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
-    static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
-    const llama_token_attr attr = llama_token_get_attr(model, token);
-    if (!special && (attr & attr_special)) {
-        return 0;
-    }
-
-    // copy piece chars to output text buffer
-    // skip up to 'lstrip' leading spaces before copying
-    auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
-        for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
-            token++;
-            size--;
-        }
-        if (length < (int32_t)size) {
-            return -(int32_t) size;
-        }
-        memcpy(buf, token, size);
-        return (int32_t) size;
-    };
-
-    // if we have a cache - use it
-    {
-        const auto & cache = model->vocab.cache_token_to_piece;
-
-        if (!cache.empty()) {
-            const auto & result = cache.at(token);
-            return _try_copy(result.data(), result.size());
-        }
-    }
-
-    if (0 <= token && token < llama_n_vocab(model)) {
-        const std::string & token_text = model->vocab.id_to_token[token].text;
-        switch (llama_vocab_get_type(model->vocab)) {
-            case LLAMA_VOCAB_TYPE_WPM:
-            case LLAMA_VOCAB_TYPE_SPM:
-            case LLAMA_VOCAB_TYPE_UGM: {
-                // NOTE: we accept all unsupported token types,
-                // suppressing them like CONTROL tokens.
-                if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
-                    return _try_copy(token_text.data(), token_text.size());
-                } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
-                    std::string result = token_text;
-                    llama_unescape_whitespace(result);
-                    return _try_copy(result.data(), result.size());
-                } else if (attr & LLAMA_TOKEN_ATTR_BYTE) {
-                    char byte = (char) llama_token_to_byte(model->vocab, token);
-                    return _try_copy((char*) &byte, 1);
-                }
-                break;
-            }
-            case LLAMA_VOCAB_TYPE_BPE: {
-                // NOTE: we accept all unsupported token types,
-                // suppressing them like CONTROL tokens.
-                if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
-                    return _try_copy(token_text.data(), token_text.size());
-                } else if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
-                    std::string result = llama_decode_text(token_text);
-                    return _try_copy(result.data(), result.size());
-                }
-                break;
-            }
-            default:
-                GGML_ASSERT(false);
-        }
-    }
-    return 0;
+int32_t llama_token_to_piece(
+    const struct llama_model * model,
+                 llama_token   token,
+                        char * buf,
+                     int32_t   length,
+                     int32_t   lstrip,
+                        bool   special) {
+    return llama_token_to_piece_impl(model->vocab, token, buf, length, lstrip, special);
 }
 
 int32_t llama_detokenize(
-        const struct llama_model * model,
-               const llama_token * tokens,
-                         int32_t   n_tokens,
-                            char * text,
-                         int32_t   text_len_max,
-                            bool   remove_special,
-                            bool   unparse_special) {
-    int32_t avail = text_len_max;
-    int32_t total = 0;
-
-    // remove the leading space
-    bool remove_space = model->vocab.tokenizer_add_space_prefix;
-
-    if (remove_special && model->vocab.tokenizer_add_bos) {
-        if (n_tokens > 0 && tokens[0] == model->vocab.special_bos_id) {
-            remove_space = false;
-            n_tokens--;
-            tokens++;
-        }
-    }
-
-    if (remove_special && model->vocab.tokenizer_add_eos) {
-        if (n_tokens > 0 && tokens[n_tokens-1] == model->vocab.special_eos_id) {
-            n_tokens--;
-        }
-    }
-
-    for (int32_t i = 0; i < n_tokens; ++i) {
-        GGML_ASSERT(avail >= 0);
-        int32_t n_chars = llama_token_to_piece(model, tokens[i], text, avail, remove_space, unparse_special);
-        remove_space = false;
-        if (n_chars < 0) {
-            avail = 0;
-            total -= n_chars;
-        } else if (n_chars > 0) {
-            avail -= n_chars;
-            text  += n_chars;
-            total += n_chars;
-        }
-    }
-
-    if (total > text_len_max) {
-        return -total;
-    }
-
-    if (model->vocab.tokenizer_clean_spaces) {
-        text -= total;  // restart text
-
-        // first pass: characters ?!.,  //TODO: where do these characters come from?
-        const int32_t total1 = total;
-        total = total ? 1 : 0;
-        for (int32_t i = 1; i < total1; ++i) {
-            const char x = text[i];
-            if (text[i - 1] == ' ') {
-                if (x == '?' || x == '!' || x == '.' || x == ',') {  // " ?", " !", " .", " ,"
-                    total--;  // remove space
-                }
-            }
-            text[total++] = x;
-        }
-
-        // second pass: strip single apostrophe between spaces
-        const int32_t total2 = total;
-        total = total ? 1 : 0;
-        for (int32_t i = 1; i < total2; ++i) {
-            const char x = text[i];
-            if (x == '\'' && i + 1 < total2 && text[i - 1] == ' ' && text[i + 1] == ' ') {  // " ' "
-                total--;           // remove prev space
-                text[++i] = '\0';  // remove next space
-            }
-            text[total++] = x;
-        }
-
-        // third pass: apostrophe contractions  //NOTE: this makes sense?
-        const int32_t total3 = total;
-        total = total ? 1 : 0;
-        for (int32_t i = 1; i < total3; ++i) {
-            const char x = text[i];
-            if (text[i - 1] == ' ') {
-                if (x == '\'' && i + 1 < total3) {
-                    const char x1 = text[i + 1];
-                    if (x1 == 't' || x1 == 'd') {  // " 't", " 'd"
-                        //total--;  // remove space
-                    } else if (x1 == 's' || x1 == 'm') {  // " 's", " 'm"
-                        total--;  // remove space
-                    } else if (i + 2 < total3) {
-                        const char x2 = text[i + 2];
-                        if ((x1 == 'l' && x2 == 'l')) {  // " 'll"
-                            //total--;  // remove space
-                        } else if ((x1 == 'r' && x2 == 'e') || (x1 == 'v' && x2 == 'e')) {  // " 're", " 've"
-                            total--;  // remove space
-                        } else {
-                            //total--;  // remove space
-                        }
-                    } else {
-                        //total--;  // remove space
-                    }
-                }
-            }
-            text[total++] = x;
-        }
-    }
-
-    return total <= text_len_max ? total : -total;
+    const struct llama_model * model,
+           const llama_token * tokens,
+                     int32_t   n_tokens,
+                        char * text,
+                     int32_t   text_len_max,
+                        bool   remove_special,
+                        bool   unparse_special) {
+    return llama_detokenize_impl(model->vocab, tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
 }
 
-// trim whitespace from the beginning and end of a string
-static std::string trim(const std::string & str) {
-    size_t start = 0;
-    size_t end = str.size();
-    while (start < end && isspace(str[start])) {
-        start += 1;
-    }
-    while (end > start && isspace(str[end - 1])) {
-        end -= 1;
-    }
-    return str.substr(start, end - start);
-}
+//
+// chat templates
+//
 
 // Simple version of "llama_apply_chat_template" that only works with strings
 // This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
@@ -21655,7 +18892,7 @@ static int32_t llama_chat_apply_template_internal(
     return dest.size();
 }
 
-LLAMA_API int32_t llama_chat_apply_template(
+int32_t llama_chat_apply_template(
                 const struct llama_model * model,
                               const char * tmpl,
          const struct llama_chat_message * chat,
@@ -21696,7 +18933,126 @@ LLAMA_API int32_t llama_chat_apply_template(
     return res;
 }
 
-LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
+//
+// grammar
+//
+
+struct llama_grammar * llama_grammar_init(
+        const llama_grammar_element ** rules,
+        size_t    n_rules,
+        size_t    start_rule_index) {
+    return llama_grammar_init_impl(rules, n_rules, start_rule_index);
+}
+
+void llama_grammar_free(struct llama_grammar * grammar) {
+    llama_grammar_free_impl(grammar);
+}
+
+struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
+    return llama_grammar_copy_impl(grammar);
+}
+
+void llama_grammar_sample(
+      const struct llama_grammar * grammar,
+      const struct llama_context * ctx,
+          llama_token_data_array * candidates) {
+    llama_grammar_sample_impl(grammar, &ctx->model.vocab, &ctx->sampling, candidates);
+}
+
+void llama_sample_grammar(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+      const struct llama_grammar * grammar) {
+    llama_grammar_sample(grammar, ctx, candidates);
+}
+
+void llama_grammar_accept_token(
+            struct llama_grammar * grammar,
+            struct llama_context * ctx,
+                     llama_token   token) {
+    llama_grammar_accept_token_impl(grammar, &ctx->model.vocab, &ctx->sampling, token);
+}
+
+//
+// sampling
+//
+
+void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
+    llama_set_rng_seed_impl(&ctx->sampling, seed);
+}
+
+void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
+    llama_sample_softmax_impl(ctx ? &ctx->sampling : nullptr, candidates);
+}
+
+void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
+    llama_sample_top_k_impl(ctx ? &ctx->sampling : nullptr, candidates, k, min_keep);
+}
+
+void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
+    llama_sample_top_p_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
+}
+
+void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
+    llama_sample_min_p_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
+}
+
+void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
+    llama_sample_tail_free_impl(ctx ? &ctx->sampling : nullptr, candidates, z, min_keep);
+}
+
+void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
+    llama_sample_typical_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
+}
+
+void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
+    llama_sample_entropy_impl(ctx ? &ctx->sampling : nullptr, candidates_p, min_temp, max_temp, exponent_val);
+}
+
+void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
+    llama_sample_temp_impl(ctx ? &ctx->sampling : nullptr, candidates_p, temp);
+}
+
+void llama_sample_repetition_penalties(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+               const llama_token * last_tokens,
+                          size_t   penalty_last_n,
+                           float   penalty_repeat,
+                           float   penalty_freq,
+                           float   penalty_present) {
+    llama_sample_repetition_penalties_impl(ctx ? &ctx->sampling : nullptr, candidates, last_tokens, penalty_last_n, penalty_repeat, penalty_freq, penalty_present);
+}
+
+void llama_sample_apply_guidance(
+          struct llama_context * ctx,
+                         float * logits,
+                         float * logits_guidance,
+                         float   scale) {
+    llama_sample_apply_guidance_impl(&ctx->sampling, logits, logits_guidance, scale);
+}
+
+llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
+    return llama_sample_token_mirostat_impl(&ctx->sampling, candidates, tau, eta, m, mu);
+}
+
+llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
+    return llama_sample_token_mirostat_v2_impl(ctx ? &ctx->sampling : nullptr, candidates, tau, eta, mu);
+}
+
+llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
+    return llama_sample_token_greedy_impl(ctx ? &ctx->sampling : nullptr, candidates);
+}
+
+llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
+    return llama_sample_token_with_rng_impl(&ctx->sampling, candidates, rng);
+}
+
+llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
+    return llama_sample_token_with_rng_impl(&ctx->sampling, candidates, ctx->sampling.rng);
+}
+
+int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
     static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
     if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
         return strlen(split_path);
@@ -21725,11 +19081,11 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
         /*.t_start_ms  =*/ 1e-3 * ctx->t_start_us,
         /*.t_end_ms    =*/ 1.00 * ggml_time_ms(),
         /*.t_load_ms   =*/ 1e-3 * ctx->t_load_us,
-        /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us,
+        /*.t_sample_ms =*/ 1e-3 * ctx->sampling.t_sample_us,
         /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
         /*.t_eval_ms   =*/ 1e-3 * ctx->t_eval_us,
 
-        /*.n_sample =*/ std::max(1, ctx->n_sample),
+        /*.n_sample =*/ std::max(1, ctx->sampling.n_sample),
         /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
         /*.n_eval   =*/ std::max(1, ctx->n_eval),
     };
@@ -21752,10 +19108,11 @@ void llama_print_timings(struct llama_context * ctx) {
 }
 
 void llama_reset_timings(struct llama_context * ctx) {
-    ctx->t_start_us = ggml_time_us();
-    ctx->t_sample_us = ctx->n_sample = 0;
+    ctx->t_start_us  = ggml_time_us();
     ctx->t_eval_us   = ctx->n_eval   = 0;
     ctx->t_p_eval_us = ctx->n_p_eval = 0;
+
+    ctx->sampling.reset_timings();
 }
 
 const char * llama_print_system_info(void) {
@@ -21781,11 +19138,7 @@ const char * llama_print_system_info(void) {
     s += "SSSE3 = "       + std::to_string(ggml_cpu_has_ssse3())       + " | ";
     s += "VSX = "         + std::to_string(ggml_cpu_has_vsx())         + " | ";
     s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
-#ifdef GGML_USE_LLAMAFILE
-    s += "LLAMAFILE = 1 | ";
-#else
-    s += "LLAMAFILE = 0 | ";
-#endif
+    s += "LLAMAFILE = "   + std::to_string(ggml_cpu_has_llamafile())   + " | ";
 
     return s.c_str();
 }
@@ -21802,20 +19155,20 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
     fprintf(stream, "mst_p_eval: %.2f  # ms / token during prompt processing\n",
             1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
     fprintf(stream, "mst_sample: %.2f  # ms / token during sampling\n",
-            1.0e-3 * ctx->t_sample_us / ctx->n_sample);
+            1.0e-3 * ctx->sampling.t_sample_us / ctx->sampling.n_sample);
     fprintf(stream, "n_eval: %d  # number of tokens generated (excluding the first one)\n", ctx->n_eval);
     fprintf(stream, "n_p_eval: %d  # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
-    fprintf(stream, "n_sample: %d  # number of sampled tokens\n", ctx->n_sample);
+    fprintf(stream, "n_sample: %d  # number of sampled tokens\n", ctx->sampling.n_sample);
     fprintf(stream, "t_eval_us: %" PRId64 "  # total microseconds spent generating tokens\n", ctx->t_eval_us);
     fprintf(stream, "t_load_us: %" PRId64 "  # total microseconds spent loading the model\n", ctx->t_load_us);
     fprintf(stream, "t_p_eval_us: %" PRId64 "  # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
-    fprintf(stream, "t_sample_us: %" PRId64 "  # total microseconds spent sampling\n", ctx->t_sample_us);
+    fprintf(stream, "t_sample_us: %" PRId64 "  # total microseconds spent sampling\n", ctx->sampling.t_sample_us);
     fprintf(stream, "ts_eval: %.2f  # tokens / second during generation\n",
             1.0e6 * ctx->n_eval / ctx->t_eval_us);
     fprintf(stream, "ts_p_eval: %.2f  # tokens / second during prompt processing\n",
             1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
     fprintf(stream, "ts_sample: %.2f  # tokens / second during sampling\n",
-            1.0e6 * ctx->n_sample / ctx->t_sample_us);
+            1.0e6 * ctx->sampling.n_sample / ctx->sampling.t_sample_us);
 }
 
 // For internal test use
@@ -21854,14 +19207,14 @@ static void llama_log_internal_v(ggml_log_level level, const char * format, va_l
     va_end(args_copy);
 }
 
-static void llama_log_internal(ggml_log_level level, const char * format, ...) {
+void llama_log_internal(ggml_log_level level, const char * format, ...) {
     va_list args;
     va_start(args, format);
     llama_log_internal_v(level, format, args);
     va_end(args);
 }
 
-static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
+void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
     (void) level;
     (void) user_data;
     fputs(text, stderr);
diff --git a/src/unicode.cpp b/src/unicode.cpp
index e05fb9d17..46650bff0 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -19,6 +19,12 @@
 #include <locale>
 #include <codecvt>
 
+size_t unicode_len_utf8(char src) {
+    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
+    return lookup[highbits];
+}
+
 static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
     std::string result;
     for (size_t i = 0; i < cps.size(); ++i) {
diff --git a/src/unicode.h b/src/unicode.h
index 30b07ba7f..008532a24 100644
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -4,6 +4,8 @@
 #include <string>
 #include <vector>
 
+// TODO: prefix all symbols with "llama_"
+
 struct codepoint_flags {
     enum {
         UNDEFINED       = 0x0001,
@@ -46,6 +48,7 @@ struct codepoint_flags {
     }
 };
 
+size_t unicode_len_utf8(char src);
 
 std::string unicode_cpt_to_utf8(uint32_t cp);
 uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index cfa707315..0207e3a59 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -70,21 +70,19 @@ add_executable(test-tokenizer-0 test-tokenizer-0.cpp)
 target_link_libraries(test-tokenizer-0 PRIVATE common)
 install(TARGETS test-tokenizer-0 RUNTIME)
 
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3             ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon            ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge          ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf)
-# TODO: enable when fixed
-#       https://github.com/ggerganov/llama.cpp/pull/7036
-#llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt               ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
-#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm      ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
-#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2             ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact            ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm      ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon            ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2             ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt               ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3             ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2             ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact            ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
 
 # build test-tokenizer-1-bpe target once and add many tests
 add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
@@ -92,16 +90,14 @@ target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
 install(TARGETS test-tokenizer-1-bpe RUNTIME)
 
 # TODO: disabled due to slowness
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-stablelm  ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2     ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox  ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt2      ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-bloom     ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf)
 
 # build test-tokenizer-1-spm target once and add many tests
 add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)
diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp
index 6583dd0b2..a8222caee 100644
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -1,4 +1,3 @@
-#include <iostream>
 #include <string>
 #include <vector>
 #include <sstream>
@@ -133,13 +132,31 @@ int main(void) {
         );
         formatted_chat.resize(res);
         std::string output(formatted_chat.data(), formatted_chat.size());
-        std::cout << output << "\n-------------------------\n";
+        printf("%s\n", output.c_str());
+        printf("-------------------------\n");
         assert(output == expected);
     }
 
-    // test llama_chat_format_single
-    std::cout << "\n\n=== llama_chat_format_single ===\n\n";
+
+    // test llama_chat_format_single for system message
+    printf("\n\n=== llama_chat_format_single (system message) ===\n\n");
     std::vector<llama_chat_msg> chat2;
+    llama_chat_msg sys_msg{"system", "You are a helpful assistant"};
+
+    auto fmt_sys = [&](std::string tmpl) {
+        auto output = llama_chat_format_single(nullptr, tmpl, chat2, sys_msg, false);
+        printf("fmt_sys(%s) : %s\n", tmpl.c_str(), output.c_str());
+        printf("-------------------------\n");
+        return output;
+    };
+    assert(fmt_sys("chatml") == "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n");
+    assert(fmt_sys("llama2") == "[INST] You are a helpful assistant\n");
+    assert(fmt_sys("gemma")  == ""); // for gemma, system message is merged with user message
+    assert(fmt_sys("llama3") == "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|>");
+
+
+    // test llama_chat_format_single for user message
+    printf("\n\n=== llama_chat_format_single (user message) ===\n\n");
     chat2.push_back({"system", "You are a helpful assistant"});
     chat2.push_back({"user", "Hello"});
     chat2.push_back({"assistant", "I am assistant"});
@@ -147,12 +164,13 @@ int main(void) {
 
     auto fmt_single = [&](std::string tmpl) {
         auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
-        std::cout << "fmt_single(" << tmpl << ")\n" << output << "\n-------------------------\n";
+        printf("fmt_single(%s) : %s\n", tmpl.c_str(), output.c_str());
+        printf("-------------------------\n");
         return output;
     };
     assert(fmt_single("chatml") == "\n<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n");
     assert(fmt_single("llama2") == "[INST] How are you [/INST]");
-    assert(fmt_single("gemma") == "\n<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
+    assert(fmt_single("gemma")  == "\n<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
     assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");
 
     return 0;
diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp
index 975658f79..68f971bfe 100644
--- a/tests/test-grammar-integration.cpp
+++ b/tests/test-grammar-integration.cpp
@@ -44,21 +44,26 @@ static bool test_build_grammar_fails(const std::string & grammar_str) {
     return grammar_fails;
 }
 
-static bool match_string(const std::string & input, llama_grammar* grammar) {
+static bool match_string(const std::string & input, llama_grammar * grammar) {
     auto decoded = decode_utf8(input, {});
 
     const auto & code_points = decoded.first;
 
+    const llama_grammar_rules  & rules      = llama_grammar_get_rules (grammar);
+          llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);
+
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-        auto prev_stacks = grammar->stacks;
-        llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
-        if (grammar->stacks.empty()) {
+        const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy
+
+        llama_grammar_accept(rules, prev_stacks, *it, cur_stacks);
+
+        if (cur_stacks.empty()) {
             // no stacks means that the grammar failed to match at this point
             return false;
         }
     }
 
-    for (const auto & stack : grammar->stacks) {
+    for (const auto & stack : cur_stacks) {
         if (stack.empty()) {
             // An empty stack means that the grammar has been completed
             return true;
@@ -75,7 +80,9 @@ static void test(const std::string & test_desc, const std::string & grammar_str,
     auto grammar = build_grammar(grammar_str);
 
     // Save the original grammar stacks so that we can reset after every new string we want to test
-    auto original_stacks = grammar->stacks;
+    const llama_grammar_stacks original_stacks = llama_grammar_get_stacks(grammar);
+
+    llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);
 
     fprintf(stderr, "  🔵 Valid strings:\n");
 
@@ -112,7 +119,7 @@ static void test(const std::string & test_desc, const std::string & grammar_str,
         assert(matched);
 
         // Reset the grammar stacks
-        grammar->stacks = original_stacks;
+        cur_stacks = original_stacks;
     }
 
     fprintf(stderr, "  🟠 Invalid strings:\n");
@@ -132,7 +139,7 @@ static void test(const std::string & test_desc, const std::string & grammar_str,
         assert(!matched);
 
         // Reset the grammar stacks
-        grammar->stacks = original_stacks;
+        cur_stacks = original_stacks;
     }
 
     // Clean up allocated memory
diff --git a/tests/test-llama-grammar.cpp b/tests/test-llama-grammar.cpp
index c8badb206..1f3a267b3 100644
--- a/tests/test-llama-grammar.cpp
+++ b/tests/test-llama-grammar.cpp
@@ -2,10 +2,12 @@
 #undef NDEBUG
 #endif
 
-#include "llama.cpp" // TODO: not great
+#define LLAMA_API_INTERNAL
+#include "llama.h"
 #include "grammar-parser.h"
 
 #include <cassert>
+#include <stdexcept>
 
 int main()
 {
@@ -112,10 +114,10 @@ int main()
         }
     }
 
-    llama_grammar *grammar = NULL;
+    llama_grammar * grammar = NULL;
     std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
-    grammar = llama_grammar_init(
-        grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+
+    grammar = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
     if (grammar == nullptr)
     {
         throw std::runtime_error("Failed to initialize llama_grammar");
@@ -172,7 +174,7 @@ int main()
         }};
 
     auto index = 0;
-    for (auto stack : grammar->stacks)
+    for (auto stack : llama_grammar_get_stacks(grammar))
     {
         // compare stack to expected_stack
         for (uint32_t i = 0; i < stack.size(); i++)
@@ -374,13 +376,13 @@ int main()
         },
     };
 
-    std::vector<llama_grammar_candidate> rejects = llama_grammar_reject_candidates_for_stack(grammar->rules, grammar->stacks[0], next_candidates);
+    std::vector<llama_grammar_candidate> rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[0], next_candidates);
 
     std::vector<std::vector<llama_grammar_candidate>> all_rejects;
 
-    for (std::size_t count = 0; count < grammar->stacks.size(); ++count)
+    for (std::size_t count = 0; count < llama_grammar_get_stacks(grammar).size(); ++count)
     {
-        rejects = llama_grammar_reject_candidates_for_stack(grammar->rules, grammar->stacks[count], next_candidates);
+        rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[count], next_candidates);
         all_rejects.push_back(rejects);
     }
 
@@ -401,6 +403,6 @@ int main()
         delete[] candidate.code_points;
         candidate.code_points = nullptr;
     }
-    delete grammar;
+    llama_grammar_free(grammar);
     return 0;
 }