Merge remote-tracking branch 'upstream/master' into cancel-model-load

2023-12-20 00:15:09 -05:00 · 2023-12-20 00:15:09 -05:00 · ba46057b11
commit ba46057b11
parent ca122dc9e0 328b83de23
32 changed files with 1956 additions and 882 deletions
--- a/.editorconfig
+++ b/.editorconfig
@ -23,3 +23,6 @@ insert_final_newline = unset
 [examples/server/public/*]
 indent_size = 2
 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -291,7 +291,12 @@ if (LLAMA_CUBLAS)
        add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
        if (LLAMA_STATIC)
-            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+            if (WIN32)
                # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
            else ()
                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
            endif()
        else()
            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
        endif()
--- a/12
+++ b/12
@ -441,9 +441,15 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
 endif # LLAMA_CLBLAST
 ifdef LLAMA_HIPBLAS
-	ROCM_PATH	?= /opt/rocm
+
-	HIPCC	    ?= $(ROCM_PATH)/bin/hipcc
+	ifeq ($(wildcard /opt/rocm),)
-	GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
+		ROCM_PATH	?= /usr
 		GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
 	else
 		ROCM_PATH	?= /opt/rocm
 		GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
 	endif
 	HIPCC                   ?= $(ROCM_PATH)/bin/hipcc
 	LLAMA_CUDA_DMMV_X       ?= 32
 	LLAMA_CUDA_MMV_Y        ?= 1
 	LLAMA_CUDA_KQUANTS_ITER ?= 2
--- a/README.md
+++ b/README.md
@ -10,11 +10,11 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 ### Hot topics
 - Collecting Apple Silicon performance stats:
  - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
  - A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
 - Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406
 - **llama.h API change for handling KV cache offloading and data type: https://github.com/ggerganov/llama.cpp/pull/4309**
 - Using `llama.cpp` with AWS instances: https://github.com/ggerganov/llama.cpp/discussions/4225
 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
 - Collecting Apple Silicon performance stats: https://github.com/ggerganov/llama.cpp/discussions/4167
 ----
--- a/common/train.cpp
+++ b/common/train.cpp
@ -71,7 +71,7 @@ void free_random_uniform_distribution(struct random_uniform_distribution * rnd)
 struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
    float scale = 1.0f; // xavier
-    switch (tensor->n_dims) {
+    switch (ggml_n_dims(tensor)) {
        case 1:
            scale /= sqrtf((float) tensor->ne[0]);
            for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
@ -119,7 +119,7 @@ struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct
 }
 struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) {
-    switch (tensor->n_dims) {
+    switch (ggml_n_dims(tensor)) {
        case 1:
            for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
                float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);
@ -183,25 +183,27 @@ float fclamp(const float v, const float min, const float max) {
 }
 void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
    GGML_ASSERT(tensor->n_dims == 1);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == 1);
    GGML_ASSERT(tensor->ne[2] == 1);
    GGML_ASSERT(tensor->ne[3] == 1);
 }
 void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
    GGML_ASSERT(tensor->n_dims == 2);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
    GGML_ASSERT(tensor->ne[2] == 1);
    GGML_ASSERT(tensor->ne[3] == 1);
 }
 void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
    GGML_ASSERT(tensor->n_dims == 3);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
    GGML_ASSERT(tensor->ne[2] == ne2);
    GGML_ASSERT(tensor->ne[3] == 1);
 }
 void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
    GGML_ASSERT(tensor->n_dims == 4);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
    GGML_ASSERT(tensor->ne[2] == ne2);
@ -225,8 +227,8 @@ int64_t get_example_targets_batch(
    bool                   sample_random_offsets
 ) {
    GGML_ASSERT(samples_count > 0);
-    GGML_ASSERT(tokens_input->n_dims  == 2);
+    GGML_ASSERT(ggml_is_matrix(tokens_input));
-    GGML_ASSERT(target_probs->n_dims  == 3);
+    GGML_ASSERT(ggml_is_3d(target_probs));
    int64_t n_vocab  = target_probs->ne[0];
    int64_t n_tokens = tokens_input->ne[0];
    int64_t n_batch  = tokens_input->ne[1];
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@ -182,6 +182,8 @@ class Model:
            return QwenModel
        if model_architecture == "MixtralForCausalLM":
            return MixtralModel
        if model_architecture == "PhiForCausalLM":
            return Phi2Model
        return Model
    def _is_model_safetensors(self) -> bool:
@ -221,6 +223,8 @@ class Model:
            return gguf.MODEL_ARCH.QWEN
        if arch == "MixtralForCausalLM":
            return gguf.MODEL_ARCH.LLAMA
        if arch == "PhiForCausalLM":
            return gguf.MODEL_ARCH.PHI2
        raise NotImplementedError(f'Architecture "{arch}" not supported!')
@ -980,6 +984,24 @@ class QwenModel(Model):
            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)
 class Phi2Model(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layer"]
        self.gguf_writer.add_name("Phi2")
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(self.hparams["n_head"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["rotary_dim"])
        self.gguf_writer.add_file_type(self.ftype)
        self.gguf_writer.add_add_bos_token(False)
 ###### CONVERSION LOGIC ######
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@ -3,7 +3,6 @@ from __future__ import annotations
 import json
 import os
 import re
 import struct
 import sys
 from typing import Any, BinaryIO, Sequence
@ -11,43 +10,15 @@ from typing import Any, BinaryIO, Sequence
 import numpy as np
 import torch
 from pathlib import Path
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf
 NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
 HF_SUBLAYER_TO_GGML = {
    "self_attn.q_proj": "attn_q",
    "self_attn.k_proj": "attn_k",
    "self_attn.v_proj": "attn_v",
    "self_attn.o_proj": "attn_output",
    "mlp.gate_proj": "ffn_gate",
    "mlp.down_proj": "ffn_down",
    "mlp.up_proj": "ffn_up",
    "input_layernorm": "attn_norm",
    "post_attention_layernorm": "ffn_norm",
 }
 def translate_tensor_name(t: str) -> str:
    match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t)
    if match:
        nn = match.group(1)
        sub_layer = match.group(2)
        lora_type = match.group(3)
        sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer)
        if sub_layer_renamed is None:
            print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
            sys.exit(1)
        output_string = (
            f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
        )
        return output_string
    else:
        print(f"Error: unrecognized tensor {t}")
        sys.exit(1)
 def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
    fout.write(b"ggla"[::-1])  # magic (ggml lora)
    fout.write(struct.pack("i", 1))  # file version
@ -61,9 +32,7 @@ def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
    fout.write(struct.pack("i", int(params["lora_alpha"])))
-def write_tensor_header(
+def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
    self, name: str, shape: Sequence[int], data_type: np.dtype[Any]
 ) -> None:
    sname = name.encode("utf-8")
    fout.write(
        struct.pack(
@ -78,11 +47,12 @@ def write_tensor_header(
    fout.seek((fout.tell() + 31) & -32)
-if len(sys.argv) != 2:
+if len(sys.argv) < 2:
-    print(f"Usage: python {sys.argv[0]} <path>")
+    print(f"Usage: python {sys.argv[0]} <path> [arch]")
    print(
        "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
    )
    print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
    sys.exit(1)
 input_json = os.path.join(sys.argv[1], "adapter_config.json")
@ -90,6 +60,14 @@ input_model = os.path.join(sys.argv[1], "adapter_model.bin")
 output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
 model = torch.load(input_model, map_location="cpu")
 arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
 if arch_name not in gguf.MODEL_ARCH_NAMES.values():
    print(f"Error: unsupported architecture {arch_name}")
    sys.exit(1)
 arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
 name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
 with open(input_json, "r") as f:
    params = json.load(f)
@ -117,6 +95,7 @@ with open(output_path, "wb") as fout:
    write_file_header(fout, params)
    for k, v in model.items():
        orig_k = k
        if k.endswith(".default.weight"):
            k = k.replace(".default.weight", ".weight")
        if k in ["llama_proj.weight", "llama_proj.bias"]:
@ -129,7 +108,32 @@ with open(output_path, "wb") as fout:
            v = v.float()
        t = v.detach().numpy()
-        tname = translate_tensor_name(k)
+
        prefix = "base_model.model."
        if k.startswith(prefix):
            k = k[len(prefix) :]
        lora_suffixes = (".lora_A.weight", ".lora_B.weight")
        if k.endswith(lora_suffixes):
            suffix = k[-len(lora_suffixes[0]):]
            k = k[: -len(lora_suffixes[0])]
        else:
            print(f"Error: unrecognized tensor name {orig_k}")
            sys.exit(1)
        tname = name_map.get_name(k)
        if tname is None:
            print(f"Error: could not map tensor name {orig_k}")
            print(" Note: the arch parameter must be specified if the model is not llama")
            sys.exit(1)
        if suffix == ".lora_A.weight":
            tname += ".weight.loraA"
        elif suffix == ".lora_B.weight":
            tname += ".weight.loraB"
        else:
            assert False
        print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
        write_tensor_header(fout, tname, t.shape, t.dtype)
        t.tofile(fout)
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@ -1258,9 +1258,9 @@ static struct ggml_tensor * forward_lora(
 }
 static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
-    assert(logits->n_dims == 2);
+    assert(ggml_is_matrix(logits));
-    assert(probs->n_dims == 2);
+    assert(ggml_is_matrix(probs));
-    assert(best_samples->n_dims == 1);
+    assert(ggml_is_vector(best_samples));
    assert(logits->ne[1] == best_samples->ne[0]);
    assert(logits->ne[0] == probs->ne[0]);
    assert(logits->ne[1] == probs->ne[1]);
@ -1292,9 +1292,9 @@ static void sample_softmax_batch(
    struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
    struct ggml_tensor * best_samples
 ) {
-    GGML_ASSERT(best_samples->n_dims == 2);
+    GGML_ASSERT(ggml_is_matrix(best_samples));
-    GGML_ASSERT(logits->n_dims == 3);
+    GGML_ASSERT(ggml_is_3d(logits));
-    GGML_ASSERT(probs->n_dims == 3);
+    GGML_ASSERT(ggml_is_3d(probs));
    int n_tokens = best_samples->ne[0];
    int n_batch  = best_samples->ne[1];
    int n_vocab  = logits->ne[0];
@ -1334,7 +1334,7 @@ static void print_row(struct ggml_tensor * probs, int i) {
 }
 static void print_matrix(struct ggml_tensor * probs) {
-    assert(probs->n_dims == 2);
+    assert(ggml_is_matrix(probs));
    for (int i = 0; i < probs->ne[1]; ++i) {
        for (int k = 0; k < probs->ne[0]; ++k) {
            float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
@ -1386,8 +1386,8 @@ static void get_example_targets(int example_id, struct ggml_tensor * tokens_inpu
 static void get_example_targets_batch(
    struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
 ) {
-    GGML_ASSERT(tokens_input->n_dims == 2);
+    GGML_ASSERT(ggml_is_matrix(tokens_input));
-    GGML_ASSERT(     targets->n_dims == 3);
+    GGML_ASSERT(ggml_is_3d(targets));
    int n_tokens = tokens_input->ne[0];
    int n_batch  = tokens_input->ne[1];
    GGML_ASSERT(n_tokens == targets->ne[1]);
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@ -427,7 +427,7 @@ static void print_row(struct ggml_tensor * probs, int i) {
 }
 static void print_matrix(struct ggml_tensor * probs) {
-    assert(probs->n_dims == 2);
+    assert(ggml_is_matrix(probs));
    for (int i = 0; i < probs->ne[1]; ++i) {
        for (int k = 0; k < probs->ne[0]; ++k) {
            float p = get_f32_2d(probs, k, i);
@ -639,7 +639,7 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
 static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
    int ct;
-    switch (gg_weights->n_dims){
+    switch (ggml_n_dims(gg_weights)) {
        case 1:
            ct = 0;
            for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@ -1110,7 +1110,7 @@ static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor,
        name = ggml_get_name(tensor);
    }
    uint32_t name_len = strlen(name);
-    uint32_t nd = tensor->n_dims;
+    uint32_t nd = ggml_n_dims(tensor);
    uint32_t ne[4] = { (uint32_t)tensor->ne[0],
                       (uint32_t)tensor->ne[1],
                       (uint32_t)tensor->ne[2],
@ -1620,8 +1620,6 @@ int main(int argc, char ** argv) {
    opt->params.adam.gclip              = params.common.adam_gclip;
    opt->params.adam.eps_f              = params.common.adam_eps_f;
    ggml_allocr * alloc = NULL;
    printf("%s: init model\n", __func__);
    bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train);
@ -1725,10 +1723,9 @@ int main(int argc, char ** argv) {
    // allocate input tensors
    mem_input_data.resize(max_input_size);
-    alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
+    ggml_allocr_t alloc_inps = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
-    ggml_allocr_alloc(alloc, tokens_input);
+    ggml_allocr_alloc(alloc_inps, tokens_input);
-    ggml_allocr_alloc(alloc, target_probs);
+    ggml_allocr_alloc(alloc_inps, target_probs);
    ggml_allocr_free(alloc);
    // context for compute tensors without their data
    const size_t estimated_compute_size_wo_data = (
@ -1755,7 +1752,7 @@ int main(int argc, char ** argv) {
    // find best evaluation order
    for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
        ctx_compute = ggml_init(ctx_compute_params);
-        alloc = ggml_allocr_new_measure(tensor_alignment);
+        ggml_allocr_t alloc = ggml_allocr_new_measure(tensor_alignment);
        gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
        gf->order = (enum ggml_cgraph_eval_order) order;
        gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@ -1788,7 +1785,7 @@ int main(int argc, char ** argv) {
    // allocate compute tensors
    mem_compute_data.resize(max_compute_size);
    ctx_compute = ggml_init(ctx_compute_params);
-    alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
+    ggml_allocr_t alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
    gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
    gf->order = best_order;
    gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@ -1804,6 +1801,8 @@ int main(int argc, char ** argv) {
        params.common.use_checkpointing
    );
    ggml_allocr_free(alloc);
    ggml_allocr_free(alloc_inps);
    // tokenize data
    std::vector<llama_token> train_tokens;
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@ -195,7 +195,7 @@ static bool gguf_ex_read_1(const std::string & fname) {
            struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
-            printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
+            printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data);
            // print first 10 elements
            const float * data = (const float *) cur->data;
--- a/examples/llama.swiftui/.gitignore
+++ b/examples/llama.swiftui/.gitignore
@ -1 +1,2 @@
 xcuserdata
 xcshareddata
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@ -6,16 +6,34 @@ enum LlamaError: Error {
    case couldNotInitializeContext
 }
 func llama_batch_clear(_ batch: inout llama_batch) {
    batch.n_tokens = 0
 }
 func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama_pos, _ seq_ids: [llama_seq_id], _ logits: Bool) {
    batch.token   [Int(batch.n_tokens)] = id
    batch.pos     [Int(batch.n_tokens)] = pos
    batch.n_seq_id[Int(batch.n_tokens)] = Int32(seq_ids.count)
    for i in 0..<seq_ids.count {
        batch.seq_id[Int(batch.n_tokens)]![Int(i)] = seq_ids[i]
    }
    batch.logits  [Int(batch.n_tokens)] = logits ? 1 : 0
    batch.n_tokens += 1
 }
 actor LlamaContext {
    private var model: OpaquePointer
    private var context: OpaquePointer
    private var batch: llama_batch
    private var tokens_list: [llama_token]
    /// This variable is used to store temporarily invalid cchars
    private var temporary_invalid_cchars: [CChar]
-    var n_len: Int32 = 512
+    var n_len: Int32 = 64
    var n_cur: Int32 = 0
    var n_decode: Int32 = 0
    init(model: OpaquePointer, context: OpaquePointer) {
@ -27,25 +45,34 @@ actor LlamaContext {
    }
    deinit {
        llama_batch_free(batch)
        llama_free(context)
        llama_free_model(model)
        llama_backend_free()
    }
-    static func createContext(path: String) throws -> LlamaContext {
+    static func create_context(path: String) throws -> LlamaContext {
        llama_backend_init(false)
-        let model_params = llama_model_default_params()
+        var model_params = llama_model_default_params()
 #if targetEnvironment(simulator)
        model_params.n_gpu_layers = 0
        print("Running on simulator, force use n_gpu_layers = 0")
 #endif
        let model = llama_load_model_from_file(path, model_params)
        guard let model else {
            print("Could not load model at \(path)")
            throw LlamaError.couldNotInitializeContext
        }
        let n_threads = max(1, min(8, ProcessInfo.processInfo.processorCount - 2))
        print("Using \(n_threads) threads")
        var ctx_params = llama_context_default_params()
-        ctx_params.seed = 1234
+        ctx_params.seed  = 1234
        ctx_params.n_ctx = 2048
-        ctx_params.n_threads = 8
+        ctx_params.n_threads       = UInt32(n_threads)
-        ctx_params.n_threads_batch = 8
+        ctx_params.n_threads_batch = UInt32(n_threads)
        let context = llama_new_context_with_model(model, ctx_params)
        guard let context else {
@ -56,6 +83,26 @@ actor LlamaContext {
        return LlamaContext(model: model, context: context)
    }
    func model_info() -> String {
        let result = UnsafeMutablePointer<Int8>.allocate(capacity: 256)
        result.initialize(repeating: Int8(0), count: 256)
        defer {
            result.deallocate()
        }
        // TODO: this is probably very stupid way to get the string from C
        let nChars = llama_model_desc(model, result, 256)
        let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nChars))
        var SwiftString = ""
        for char in bufferPointer {
            SwiftString.append(Character(UnicodeScalar(UInt8(char))))
        }
        return SwiftString
    }
    func get_n_tokens() -> Int32 {
        return batch.n_tokens;
    }
@ -79,16 +126,11 @@ actor LlamaContext {
            print(String(cString: token_to_piece(token: id) + [0]))
        }
-        // batch = llama_batch_init(512, 0) // done in init()
+        llama_batch_clear(&batch)
        batch.n_tokens = Int32(tokens_list.count)
-        for i1 in 0..<batch.n_tokens {
+        for i1 in 0..<tokens_list.count {
            let i = Int(i1)
-            batch.token[i] = tokens_list[i]
+            llama_batch_add(&batch, tokens_list[i], Int32(i), [0], false)
            batch.pos[i] = i1
            batch.n_seq_id[Int(i)] = 1
            batch.seq_id[Int(i)]![0] = 0
            batch.logits[i] = 0
        }
        batch.logits[Int(batch.n_tokens) - 1] = 1 // true
@ -141,18 +183,11 @@ actor LlamaContext {
        print(new_token_str)
        // tokens_list.append(new_token_id)
-        batch.n_tokens = 0
+        llama_batch_clear(&batch)
-
+        llama_batch_add(&batch, new_token_id, n_cur, [0], true)
        batch.token[Int(batch.n_tokens)] = new_token_id
        batch.pos[Int(batch.n_tokens)] = n_cur
        batch.n_seq_id[Int(batch.n_tokens)] = 1
        batch.seq_id[Int(batch.n_tokens)]![0] = 0
        batch.logits[Int(batch.n_tokens)] = 1 // true
        batch.n_tokens += 1
        n_decode += 1
-
+        n_cur    += 1
        n_cur += 1
        if llama_decode(context, batch) != 0 {
            print("failed to evaluate llama!")
@ -161,14 +196,111 @@ actor LlamaContext {
        return new_token_str
    }
    func bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) -> String {
        var pp_avg: Double = 0
        var tg_avg: Double = 0
        var pp_std: Double = 0
        var tg_std: Double = 0
        for _ in 0..<nr {
            // bench prompt processing
            llama_batch_clear(&batch)
            let n_tokens = pp
            for i in 0..<n_tokens {
                llama_batch_add(&batch, 0, Int32(i), [0], false)
            }
            batch.logits[Int(batch.n_tokens) - 1] = 1 // true
            llama_kv_cache_clear(context)
            let t_pp_start = ggml_time_us()
            if llama_decode(context, batch) != 0 {
                print("llama_decode() failed during prompt")
            }
            let t_pp_end = ggml_time_us()
            // bench text generation
            llama_kv_cache_clear(context)
            let t_tg_start = ggml_time_us()
            for i in 0..<tg {
                llama_batch_clear(&batch)
                for j in 0..<pl {
                    llama_batch_add(&batch, 0, Int32(i), [Int32(j)], true)
                }
                if llama_decode(context, batch) != 0 {
                    print("llama_decode() failed during text generation")
                }
            }
            let t_tg_end = ggml_time_us()
            llama_kv_cache_clear(context)
            let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0
            let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0
            let speed_pp = Double(pp)    / t_pp
            let speed_tg = Double(pl*tg) / t_tg
            pp_avg += speed_pp
            tg_avg += speed_tg
            pp_std += speed_pp * speed_pp
            tg_std += speed_tg * speed_tg
            print("pp \(speed_pp) t/s, tg \(speed_tg) t/s")
        }
        pp_avg /= Double(nr)
        tg_avg /= Double(nr)
        if nr > 1 {
            pp_std = sqrt(pp_std / Double(nr - 1) - pp_avg * pp_avg * Double(nr) / Double(nr - 1))
            tg_std = sqrt(tg_std / Double(nr - 1) - tg_avg * tg_avg * Double(nr) / Double(nr - 1))
        } else {
            pp_std = 0
            tg_std = 0
        }
        let model_desc     = model_info();
        let model_size     = String(format: "%.2f GiB", Double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0);
        let model_n_params = String(format: "%.2f B", Double(llama_model_n_params(model)) / 1e9);
        let backend        = "Metal";
        let pp_avg_str     = String(format: "%.2f", pp_avg);
        let tg_avg_str     = String(format: "%.2f", tg_avg);
        let pp_std_str     = String(format: "%.2f", pp_std);
        let tg_std_str     = String(format: "%.2f", tg_std);
        var result = ""
        result += String("| model | size | params | backend | test | t/s |\n")
        result += String("| --- | --- | --- | --- | --- | --- |\n")
        result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | pp \(pp) | \(pp_avg_str) ± \(pp_std_str) |\n")
        result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | tg \(tg) | \(tg_avg_str) ± \(tg_std_str) |\n")
        return result;
    }
    func clear() {
        tokens_list.removeAll()
        temporary_invalid_cchars.removeAll()
        llama_kv_cache_clear(context)
    }
    private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
        let utf8Count = text.utf8.count
-        let n_tokens = utf8Count + (add_bos ? 1 : 0)
+        let n_tokens = utf8Count + (add_bos ? 1 : 0) + 1
        let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
        let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
--- a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
+++ b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
@ -1,481 +1,483 @@
 // !$*UTF8*$!
 {
-    archiveVersion = 1;
+	archiveVersion = 1;
-    classes = {
+	classes = {
-    };
+	};
-    objectVersion = 56;
+	objectVersion = 56;
-    objects = {
+	objects = {
 /* Begin PBXBuildFile section */
-        542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; };
+		542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
-        5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; };
+		5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
-        542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; };
+		542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; };
-        542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; };
+		542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; };
-        542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; };
+		542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
-        542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; };
+		542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; };
-        549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
+		549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
-        549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; };
+		549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; };
-        8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; };
+		7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; };
-        8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; };
+		8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; };
-        8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; };
+		8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; };
-        8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */; };
+		8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; };
-        8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; };
+		8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */; };
-        8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
+		8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; };
-        8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
+		8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
-        8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
+		8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
 		8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
 /* End PBXBuildFile section */
 /* Begin PBXFileReference section */
-        542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = "<group>"; };
+		542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = "<group>"; };
-        542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = "<group>"; };
+		542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = "<group>"; };
-        542376092B0D9C40008E6A1C /* ggml-backend.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../ggml-backend.h"; sourceTree = "<group>"; };
+		542376092B0D9C40008E6A1C /* ggml-backend.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../ggml-backend.h"; sourceTree = "<group>"; };
-        5423760A2B0D9C4B008E6A1C /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../ggml-backend.c"; sourceTree = "<group>"; };
+		5423760A2B0D9C4B008E6A1C /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../ggml-backend.c"; sourceTree = "<group>"; };
-        542EA09B2AC8723900A8AEE9 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../ggml.c; sourceTree = "<group>"; };
+		542EA09B2AC8723900A8AEE9 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../ggml.c; sourceTree = "<group>"; };
-        542EA09C2AC8723900A8AEE9 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../ggml.h; sourceTree = "<group>"; };
+		542EA09C2AC8723900A8AEE9 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../ggml.h; sourceTree = "<group>"; };
-        542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../ggml-alloc.h"; sourceTree = "<group>"; };
+		542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../ggml-alloc.h"; sourceTree = "<group>"; };
-        542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../ggml-alloc.c"; sourceTree = "<group>"; };
+		542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../ggml-alloc.c"; sourceTree = "<group>"; };
-        542EA0A12AC8729100A8AEE9 /* llama.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llama.cpp; path = ../../llama.cpp; sourceTree = "<group>"; };
+		542EA0A12AC8729100A8AEE9 /* llama.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llama.cpp; path = ../../llama.cpp; sourceTree = "<group>"; };
-        542EA0A22AC8729100A8AEE9 /* llama.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llama.h; path = ../../llama.h; sourceTree = "<group>"; };
+		542EA0A22AC8729100A8AEE9 /* llama.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llama.h; path = ../../llama.h; sourceTree = "<group>"; };
-        549479C52AC9E0F200E0F78B /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../ggml-metal.m"; sourceTree = "<group>"; };
+		549479C52AC9E0F200E0F78B /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../ggml-metal.m"; sourceTree = "<group>"; };
-        549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = "<group>"; };
+		549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = "<group>"; };
-        549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = "<group>"; };
+		549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = "<group>"; };
-        549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
+		549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
-        8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = "<group>"; };
+		7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = DownloadButton.swift; sourceTree = "<group>"; };
-        8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = "<group>"; };
-        8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = "<group>"; };
+		8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; };
-        8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
+		8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = "<group>"; };
-        8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+		8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
-        8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; };
+		8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
-        8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
+		8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; };
-        8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */ = {isa = PBXFileReference; lastKnownFileType = file; path = "llama-2-7b-chat.Q2_K.gguf"; sourceTree = "<group>"; };
+		8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
-        8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
+		8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
-        8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
+		8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
-        8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
+		8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
 /* End PBXFileReference section */
 /* Begin PBXFrameworksBuildPhase section */
-        8A1C83702AC328BD0096AF73 /* Frameworks */ = {
+		8A1C83702AC328BD0096AF73 /* Frameworks */ = {
-            isa = PBXFrameworksBuildPhase;
+			isa = PBXFrameworksBuildPhase;
-            buildActionMask = 2147483647;
+			buildActionMask = 2147483647;
-            files = (
+			files = (
-                549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
+				549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
-                8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
+				8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
-            );
+			);
-            runOnlyForDeploymentPostprocessing = 0;
+			runOnlyForDeploymentPostprocessing = 0;
-        };
+		};
 /* End PBXFrameworksBuildPhase section */
 /* Begin PBXGroup section */
-        8A08D1F62AC7383900FE6CD4 /* llama.cpp */ = {
+		8A08D1F62AC7383900FE6CD4 /* llama.cpp */ = {
-            isa = PBXGroup;
+			isa = PBXGroup;
-            children = (
+			children = (
-                5423760A2B0D9C4B008E6A1C /* ggml-backend.c */,
+				5423760A2B0D9C4B008E6A1C /* ggml-backend.c */,
-                542376092B0D9C40008E6A1C /* ggml-backend.h */,
+				542376092B0D9C40008E6A1C /* ggml-backend.h */,
-                542376062B0D9BEA008E6A1C /* ggml-quants.h */,
+				542376062B0D9BEA008E6A1C /* ggml-quants.h */,
-                542376072B0D9BFB008E6A1C /* ggml-quants.c */,
+				542376072B0D9BFB008E6A1C /* ggml-quants.c */,
-                549479C82AC9E10B00E0F78B /* ggml-metal.metal */,
+				549479C82AC9E10B00E0F78B /* ggml-metal.metal */,
-                549479C62AC9E0F200E0F78B /* ggml-metal.h */,
+				549479C62AC9E0F200E0F78B /* ggml-metal.h */,
-                549479C52AC9E0F200E0F78B /* ggml-metal.m */,
+				549479C52AC9E0F200E0F78B /* ggml-metal.m */,
-                542EA09B2AC8723900A8AEE9 /* ggml.c */,
+				542EA09B2AC8723900A8AEE9 /* ggml.c */,
-                542EA09C2AC8723900A8AEE9 /* ggml.h */,
+				542EA09C2AC8723900A8AEE9 /* ggml.h */,
-                542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */,
+				542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */,
-                542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */,
+				542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */,
-                542EA0A12AC8729100A8AEE9 /* llama.cpp */,
+				542EA0A12AC8729100A8AEE9 /* llama.cpp */,
-                542EA0A22AC8729100A8AEE9 /* llama.h */,
+				542EA0A22AC8729100A8AEE9 /* llama.h */,
-            );
+			);
-            name = llama.cpp;
+			name = llama.cpp;
-            sourceTree = "<group>";
+			sourceTree = "<group>";
-        };
+		};
-        8A1C836A2AC328BD0096AF73 = {
+		8A1C836A2AC328BD0096AF73 = {
-            isa = PBXGroup;
+			isa = PBXGroup;
-            children = (
+			children = (
-                8A08D1F62AC7383900FE6CD4 /* llama.cpp */,
+				8A08D1F62AC7383900FE6CD4 /* llama.cpp */,
-                8A907F312AC7134E006146EA /* llama.cpp.swift */,
+				8A907F312AC7134E006146EA /* llama.cpp.swift */,
-                8A3F84232AC4C891005E2EE8 /* models */,
+				8A3F84232AC4C891005E2EE8 /* models */,
-                8A1C83752AC328BD0096AF73 /* llama.swiftui */,
+				8A1C83752AC328BD0096AF73 /* llama.swiftui */,
-                8A1C83742AC328BD0096AF73 /* Products */,
+				8A1C83742AC328BD0096AF73 /* Products */,
-                8A39BE082AC7601000BFEB40 /* Frameworks */,
+				8A39BE082AC7601000BFEB40 /* Frameworks */,
-            );
+			);
-            sourceTree = "<group>";
+			sourceTree = "<group>";
-        };
+		};
-        8A1C83742AC328BD0096AF73 /* Products */ = {
+		8A1C83742AC328BD0096AF73 /* Products */ = {
-            isa = PBXGroup;
+			isa = PBXGroup;
-            children = (
+			children = (
-                8A1C83732AC328BD0096AF73 /* llama.swiftui.app */,
+				8A1C83732AC328BD0096AF73 /* llama.swiftui.app */,
-            );
+			);
-            name = Products;
+			name = Products;
-            sourceTree = "<group>";
+			sourceTree = "<group>";
-        };
+		};
-        8A1C83752AC328BD0096AF73 /* llama.swiftui */ = {
+		8A1C83752AC328BD0096AF73 /* llama.swiftui */ = {
-            isa = PBXGroup;
+			isa = PBXGroup;
-            children = (
+			children = (
-                8A3F84102AC4BD85005E2EE8 /* Resources */,
+				8A3F84102AC4BD85005E2EE8 /* Resources */,
-                8A9F7C4B2AC332DC008AE1EA /* Models */,
+				8A9F7C4B2AC332DC008AE1EA /* Models */,
-                8A9F7C4A2AC332BF008AE1EA /* UI */,
+				8A9F7C4A2AC332BF008AE1EA /* UI */,
-                8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */,
+				8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */,
-                8A1C837A2AC328BE0096AF73 /* Assets.xcassets */,
+				8A1C837A2AC328BE0096AF73 /* Assets.xcassets */,
-                8A1C837C2AC328BE0096AF73 /* Preview Content */,
+				8A1C837C2AC328BE0096AF73 /* Preview Content */,
-            );
+			);
-            path = llama.swiftui;
+			path = llama.swiftui;
-            sourceTree = "<group>";
+			sourceTree = "<group>";
-        };
+		};
-        8A1C837C2AC328BE0096AF73 /* Preview Content */ = {
+		8A1C837C2AC328BE0096AF73 /* Preview Content */ = {
-            isa = PBXGroup;
+			isa = PBXGroup;
-            children = (
+			children = (
-                8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */,
+				8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */,
-            );
+			);
-            path = "Preview Content";
+			path = "Preview Content";
-            sourceTree = "<group>";
+			sourceTree = "<group>";
-        };
+		};
-        8A39BE082AC7601000BFEB40 /* Frameworks */ = {
+		8A39BE082AC7601000BFEB40 /* Frameworks */ = {
-            isa = PBXGroup;
+			isa = PBXGroup;
-            children = (
+			children = (
-                549479CA2AC9E16000E0F78B /* Metal.framework */,
+				549479CA2AC9E16000E0F78B /* Metal.framework */,
-                8A39BE092AC7601000BFEB40 /* Accelerate.framework */,
+				8A39BE092AC7601000BFEB40 /* Accelerate.framework */,
-            );
+			);
-            name = Frameworks;
+			name = Frameworks;
-            sourceTree = "<group>";
+			sourceTree = "<group>";
-        };
+		};
-        8A3F84102AC4BD85005E2EE8 /* Resources */ = {
+		8A3F84102AC4BD85005E2EE8 /* Resources */ = {
-            isa = PBXGroup;
+			isa = PBXGroup;
-            children = (
+			children = (
-                8A3F84112AC4BD8C005E2EE8 /* models */,
+				8A3F84112AC4BD8C005E2EE8 /* models */,
-            );
+			);
-            path = Resources;
+			path = Resources;
-            sourceTree = "<group>";
+			sourceTree = "<group>";
-        };
+		};
-        8A3F84112AC4BD8C005E2EE8 /* models */ = {
+		8A3F84112AC4BD8C005E2EE8 /* models */ = {
-            isa = PBXGroup;
+			isa = PBXGroup;
-            children = (
+			children = (
-                8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */,
+			);
-            );
+			path = models;
-            path = models;
+			sourceTree = "<group>";
-            sourceTree = "<group>";
+		};
-        };
+		8A907F312AC7134E006146EA /* llama.cpp.swift */ = {
-        8A907F312AC7134E006146EA /* llama.cpp.swift */ = {
+			isa = PBXGroup;
-            isa = PBXGroup;
+			children = (
-            children = (
+				8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */,
-                8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */,
+				8A907F322AC7134E006146EA /* LibLlama.swift */,
-                8A907F322AC7134E006146EA /* LibLlama.swift */,
+			);
-            );
+			path = llama.cpp.swift;
-            path = llama.cpp.swift;
+			sourceTree = "<group>";
-            sourceTree = "<group>";
+		};
-        };
+		8A9F7C4A2AC332BF008AE1EA /* UI */ = {
-        8A9F7C4A2AC332BF008AE1EA /* UI */ = {
+			isa = PBXGroup;
-            isa = PBXGroup;
+			children = (
-            children = (
+				7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */,
-                8A1C83782AC328BD0096AF73 /* ContentView.swift */,
+				8A1C83782AC328BD0096AF73 /* ContentView.swift */,
-            );
+			);
-            path = UI;
+			path = UI;
-            sourceTree = "<group>";
+			sourceTree = "<group>";
-        };
+		};
-        8A9F7C4B2AC332DC008AE1EA /* Models */ = {
+		8A9F7C4B2AC332DC008AE1EA /* Models */ = {
-            isa = PBXGroup;
+			isa = PBXGroup;
-            children = (
+			children = (
-                8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */,
+				8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */,
-            );
+			);
-            path = Models;
+			path = Models;
-            sourceTree = "<group>";
+			sourceTree = "<group>";
-        };
+		};
 /* End PBXGroup section */
 /* Begin PBXNativeTarget section */
-        8A1C83722AC328BD0096AF73 /* llama.swiftui */ = {
+		8A1C83722AC328BD0096AF73 /* llama.swiftui */ = {
-            isa = PBXNativeTarget;
+			isa = PBXNativeTarget;
-            buildConfigurationList = 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */;
+			buildConfigurationList = 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */;
-            buildPhases = (
+			buildPhases = (
-                8A1C836F2AC328BD0096AF73 /* Sources */,
+				8A1C836F2AC328BD0096AF73 /* Sources */,
-                8A1C83702AC328BD0096AF73 /* Frameworks */,
+				8A1C83702AC328BD0096AF73 /* Frameworks */,
-                8A1C83712AC328BD0096AF73 /* Resources */,
+				8A1C83712AC328BD0096AF73 /* Resources */,
-            );
+			);
-            buildRules = (
+			buildRules = (
-            );
+			);
-            dependencies = (
+			dependencies = (
-            );
+			);
-            name = llama.swiftui;
+			name = llama.swiftui;
-            packageProductDependencies = (
+			packageProductDependencies = (
-            );
+			);
-            productName = llama.swiftui;
+			productName = llama.swiftui;
-            productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
+			productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
-            productType = "com.apple.product-type.application";
+			productType = "com.apple.product-type.application";
-        };
+		};
 /* End PBXNativeTarget section */
 /* Begin PBXProject section */
-        8A1C836B2AC328BD0096AF73 /* Project object */ = {
+		8A1C836B2AC328BD0096AF73 /* Project object */ = {
-            isa = PBXProject;
+			isa = PBXProject;
-            attributes = {
+			attributes = {
-                BuildIndependentTargetsInParallel = 1;
+				BuildIndependentTargetsInParallel = 1;
-                LastSwiftUpdateCheck = 1500;
+				LastSwiftUpdateCheck = 1500;
-                LastUpgradeCheck = 1500;
+				LastUpgradeCheck = 1500;
-                TargetAttributes = {
+				TargetAttributes = {
-                    8A1C83722AC328BD0096AF73 = {
+					8A1C83722AC328BD0096AF73 = {
-                        CreatedOnToolsVersion = 15.0;
+						CreatedOnToolsVersion = 15.0;
-                        LastSwiftMigration = 1500;
+						LastSwiftMigration = 1500;
-                    };
+					};
-                };
+				};
-            };
+			};
-            buildConfigurationList = 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */;
+			buildConfigurationList = 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */;
-            compatibilityVersion = "Xcode 14.0";
+			compatibilityVersion = "Xcode 14.0";
-            developmentRegion = en;
+			developmentRegion = en;
-            hasScannedForEncodings = 0;
+			hasScannedForEncodings = 0;
-            knownRegions = (
+			knownRegions = (
-                en,
+				en,
-                Base,
+				Base,
-            );
+			);
-            mainGroup = 8A1C836A2AC328BD0096AF73;
+			mainGroup = 8A1C836A2AC328BD0096AF73;
-            packageReferences = (
+			packageReferences = (
-            );
+			);
-            productRefGroup = 8A1C83742AC328BD0096AF73 /* Products */;
+			productRefGroup = 8A1C83742AC328BD0096AF73 /* Products */;
-            projectDirPath = "";
+			projectDirPath = "";
-            projectRoot = "";
+			projectRoot = "";
-            targets = (
+			targets = (
-                8A1C83722AC328BD0096AF73 /* llama.swiftui */,
+				8A1C83722AC328BD0096AF73 /* llama.swiftui */,
-            );
+			);
-        };
+		};
 /* End PBXProject section */
 /* Begin PBXResourcesBuildPhase section */
-        8A1C83712AC328BD0096AF73 /* Resources */ = {
+		8A1C83712AC328BD0096AF73 /* Resources */ = {
-            isa = PBXResourcesBuildPhase;
+			isa = PBXResourcesBuildPhase;
-            buildActionMask = 2147483647;
+			buildActionMask = 2147483647;
-            files = (
+			files = (
-                542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */,
+				542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */,
-                8A3F84242AC4C891005E2EE8 /* models in Resources */,
+				8A3F84242AC4C891005E2EE8 /* models in Resources */,
-                8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */,
+				8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */,
-                8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */,
+				8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */,
-            );
+			);
-            runOnlyForDeploymentPostprocessing = 0;
+			runOnlyForDeploymentPostprocessing = 0;
-        };
+		};
 /* End PBXResourcesBuildPhase section */
 /* Begin PBXSourcesBuildPhase section */
-        8A1C836F2AC328BD0096AF73 /* Sources */ = {
+		8A1C836F2AC328BD0096AF73 /* Sources */ = {
-            isa = PBXSourcesBuildPhase;
+			isa = PBXSourcesBuildPhase;
-            buildActionMask = 2147483647;
+			buildActionMask = 2147483647;
-            files = (
+			files = (
-                542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */,
+				542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */,
-                549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */,
+				549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */,
-                542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */,
+				542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */,
-                8A907F332AC7138A006146EA /* LibLlama.swift in Sources */,
+				8A907F332AC7138A006146EA /* LibLlama.swift in Sources */,
-                542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */,
+				542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */,
-                8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */,
+				8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */,
-                8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */,
+				8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */,
-                8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */,
+				8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */,
-                542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */,
+				7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */,
-                5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */,
+				542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */,
-            );
+				5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */,
-            runOnlyForDeploymentPostprocessing = 0;
+			);
-        };
+			runOnlyForDeploymentPostprocessing = 0;
 		};
 /* End PBXSourcesBuildPhase section */
 /* Begin XCBuildConfiguration section */
-        8A1C837F2AC328BE0096AF73 /* Debug */ = {
+		8A1C837F2AC328BE0096AF73 /* Debug */ = {
-            isa = XCBuildConfiguration;
+			isa = XCBuildConfiguration;
-            buildSettings = {
+			buildSettings = {
-                ALWAYS_SEARCH_USER_PATHS = NO;
+				ALWAYS_SEARCH_USER_PATHS = NO;
-                ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
+				ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
-                CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NONNULL = YES;
-                CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
-                CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
-                CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_MODULES = YES;
-                CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
-                CLANG_ENABLE_OBJC_WEAK = YES;
+				CLANG_ENABLE_OBJC_WEAK = YES;
-                CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
-                CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
-                CLANG_WARN_COMMA = YES;
+				CLANG_WARN_COMMA = YES;
-                CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
-                CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
+				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
-                CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
-                CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
-                CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_EMPTY_BODY = YES;
-                CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
-                CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
-                CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
-                CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
-                CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
+				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
-                CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
-                CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
-                CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
+				CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
-                CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
-                CLANG_WARN_STRICT_PROTOTYPES = YES;
+				CLANG_WARN_STRICT_PROTOTYPES = YES;
-                CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
-                CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
+				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
-                CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
-                CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
-                COPY_PHASE_STRIP = NO;
+				COPY_PHASE_STRIP = NO;
-                DEBUG_INFORMATION_FORMAT = dwarf;
+				DEBUG_INFORMATION_FORMAT = dwarf;
-                ENABLE_STRICT_OBJC_MSGSEND = YES;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
-                ENABLE_TESTABILITY = YES;
+				ENABLE_TESTABILITY = YES;
-                ENABLE_USER_SCRIPT_SANDBOXING = YES;
+				ENABLE_USER_SCRIPT_SANDBOXING = YES;
-                GCC_C_LANGUAGE_STANDARD = gnu17;
+				GCC_C_LANGUAGE_STANDARD = gnu17;
-                GCC_DYNAMIC_NO_PIC = NO;
+				GCC_DYNAMIC_NO_PIC = NO;
-                GCC_NO_COMMON_BLOCKS = YES;
+				GCC_NO_COMMON_BLOCKS = YES;
-                GCC_OPTIMIZATION_LEVEL = 0;
+				GCC_OPTIMIZATION_LEVEL = 0;
-                GCC_PREPROCESSOR_DEFINITIONS = (
+				GCC_PREPROCESSOR_DEFINITIONS = (
-                    "DEBUG=1",
+					"DEBUG=1",
-                    "$(inherited)",
+					"$(inherited)",
-                );
+				);
-                GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
-                GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
-                GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
-                GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
-                GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_FUNCTION = YES;
-                GCC_WARN_UNUSED_VARIABLE = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
-                IPHONEOS_DEPLOYMENT_TARGET = 17.0;
+				IPHONEOS_DEPLOYMENT_TARGET = 17.0;
-                LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
+				LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
-                MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
+				MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
-                MTL_FAST_MATH = YES;
+				MTL_FAST_MATH = YES;
-                ONLY_ACTIVE_ARCH = YES;
+				ONLY_ACTIVE_ARCH = YES;
-                SDKROOT = iphoneos;
+				SDKROOT = iphoneos;
-                SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)";
+				SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)";
-                SWIFT_OPTIMIZATION_LEVEL = "-Onone";
+				SWIFT_OPTIMIZATION_LEVEL = "-Onone";
-            };
+			};
-            name = Debug;
+			name = Debug;
-        };
+		};
-        8A1C83802AC328BE0096AF73 /* Release */ = {
+		8A1C83802AC328BE0096AF73 /* Release */ = {
-            isa = XCBuildConfiguration;
+			isa = XCBuildConfiguration;
-            buildSettings = {
+			buildSettings = {
-                ALWAYS_SEARCH_USER_PATHS = NO;
+				ALWAYS_SEARCH_USER_PATHS = NO;
-                ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
+				ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
-                CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NONNULL = YES;
-                CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
-                CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
-                CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_MODULES = YES;
-                CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
-                CLANG_ENABLE_OBJC_WEAK = YES;
+				CLANG_ENABLE_OBJC_WEAK = YES;
-                CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
-                CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
-                CLANG_WARN_COMMA = YES;
+				CLANG_WARN_COMMA = YES;
-                CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
-                CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
+				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
-                CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
-                CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
-                CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_EMPTY_BODY = YES;
-                CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
-                CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
-                CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
-                CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
-                CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
+				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
-                CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
-                CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
-                CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
+				CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
-                CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
-                CLANG_WARN_STRICT_PROTOTYPES = YES;
+				CLANG_WARN_STRICT_PROTOTYPES = YES;
-                CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
-                CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
+				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
-                CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
-                CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
-                COPY_PHASE_STRIP = NO;
+				COPY_PHASE_STRIP = NO;
-                DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
-                ENABLE_NS_ASSERTIONS = NO;
+				ENABLE_NS_ASSERTIONS = NO;
-                ENABLE_STRICT_OBJC_MSGSEND = YES;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
-                ENABLE_USER_SCRIPT_SANDBOXING = YES;
+				ENABLE_USER_SCRIPT_SANDBOXING = YES;
-                GCC_C_LANGUAGE_STANDARD = gnu17;
+				GCC_C_LANGUAGE_STANDARD = gnu17;
-                GCC_NO_COMMON_BLOCKS = YES;
+				GCC_NO_COMMON_BLOCKS = YES;
-                GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
-                GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
-                GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
-                GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
-                GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_FUNCTION = YES;
-                GCC_WARN_UNUSED_VARIABLE = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
-                IPHONEOS_DEPLOYMENT_TARGET = 17.0;
+				IPHONEOS_DEPLOYMENT_TARGET = 17.0;
-                LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
+				LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
-                MTL_ENABLE_DEBUG_INFO = NO;
+				MTL_ENABLE_DEBUG_INFO = NO;
-                MTL_FAST_MATH = YES;
+				MTL_FAST_MATH = YES;
-                SDKROOT = iphoneos;
+				SDKROOT = iphoneos;
-                SWIFT_COMPILATION_MODE = wholemodule;
+				SWIFT_COMPILATION_MODE = wholemodule;
-                VALIDATE_PRODUCT = YES;
+				VALIDATE_PRODUCT = YES;
-            };
+			};
-            name = Release;
+			name = Release;
-        };
+		};
-        8A1C83822AC328BE0096AF73 /* Debug */ = {
+		8A1C83822AC328BE0096AF73 /* Debug */ = {
-            isa = XCBuildConfiguration;
+			isa = XCBuildConfiguration;
-            buildSettings = {
+			buildSettings = {
-                ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
-                ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
+				ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
-                CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_MODULES = YES;
-                CODE_SIGN_STYLE = Automatic;
+				CODE_SIGN_STYLE = Automatic;
-                CURRENT_PROJECT_VERSION = 1;
+				CURRENT_PROJECT_VERSION = 1;
-                DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
+				DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
-                DEVELOPMENT_TEAM = STLSG3FG8Q;
+				DEVELOPMENT_TEAM = STLSG3FG8Q;
-                ENABLE_PREVIEWS = YES;
+				ENABLE_PREVIEWS = YES;
-                GENERATE_INFOPLIST_FILE = YES;
+				GENERATE_INFOPLIST_FILE = YES;
-                INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
+				INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
-                INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
+				INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
-                INFOPLIST_KEY_UILaunchScreen_Generation = YES;
+				INFOPLIST_KEY_UILaunchScreen_Generation = YES;
-                INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
+				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
-                INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
+				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
-                IPHONEOS_DEPLOYMENT_TARGET = 16.0;
+				IPHONEOS_DEPLOYMENT_TARGET = 16.0;
-                LD_RUNPATH_SEARCH_PATHS = (
+				LD_RUNPATH_SEARCH_PATHS = (
-                    "$(inherited)",
+					"$(inherited)",
-                    "@executable_path/Frameworks",
+					"@executable_path/Frameworks",
-                );
+				);
-                MARKETING_VERSION = 1.0;
+				MARKETING_VERSION = 1.0;
-                PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
+				PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
-                PRODUCT_NAME = "$(TARGET_NAME)";
+				PRODUCT_NAME = "$(TARGET_NAME)";
-                SWIFT_EMIT_LOC_STRINGS = YES;
+				SWIFT_EMIT_LOC_STRINGS = YES;
-                SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
+				SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
-                SWIFT_OPTIMIZATION_LEVEL = "-Onone";
+				SWIFT_OPTIMIZATION_LEVEL = "-Onone";
-                SWIFT_VERSION = 5.0;
+				SWIFT_VERSION = 5.0;
-                TARGETED_DEVICE_FAMILY = "1,2";
+				TARGETED_DEVICE_FAMILY = "1,2";
-            };
+			};
-            name = Debug;
+			name = Debug;
-        };
+		};
-        8A1C83832AC328BE0096AF73 /* Release */ = {
+		8A1C83832AC328BE0096AF73 /* Release */ = {
-            isa = XCBuildConfiguration;
+			isa = XCBuildConfiguration;
-            buildSettings = {
+			buildSettings = {
-                ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
-                ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
+				ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
-                CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_MODULES = YES;
-                CODE_SIGN_STYLE = Automatic;
+				CODE_SIGN_STYLE = Automatic;
-                CURRENT_PROJECT_VERSION = 1;
+				CURRENT_PROJECT_VERSION = 1;
-                DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
+				DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
-                DEVELOPMENT_TEAM = STLSG3FG8Q;
+				DEVELOPMENT_TEAM = STLSG3FG8Q;
-                ENABLE_PREVIEWS = YES;
+				ENABLE_PREVIEWS = YES;
-                GENERATE_INFOPLIST_FILE = YES;
+				GENERATE_INFOPLIST_FILE = YES;
-                INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
+				INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
-                INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
+				INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
-                INFOPLIST_KEY_UILaunchScreen_Generation = YES;
+				INFOPLIST_KEY_UILaunchScreen_Generation = YES;
-                INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
+				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
-                INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
+				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
-                IPHONEOS_DEPLOYMENT_TARGET = 16.0;
+				IPHONEOS_DEPLOYMENT_TARGET = 16.0;
-                LD_RUNPATH_SEARCH_PATHS = (
+				LD_RUNPATH_SEARCH_PATHS = (
-                    "$(inherited)",
+					"$(inherited)",
-                    "@executable_path/Frameworks",
+					"@executable_path/Frameworks",
-                );
+				);
-                MARKETING_VERSION = 1.0;
+				MARKETING_VERSION = 1.0;
-                PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
+				PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
-                PRODUCT_NAME = "$(TARGET_NAME)";
+				PRODUCT_NAME = "$(TARGET_NAME)";
-                SWIFT_EMIT_LOC_STRINGS = YES;
+				SWIFT_EMIT_LOC_STRINGS = YES;
-                SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
+				SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
-                SWIFT_VERSION = 5.0;
+				SWIFT_VERSION = 5.0;
-                TARGETED_DEVICE_FAMILY = "1,2";
+				TARGETED_DEVICE_FAMILY = "1,2";
-            };
+			};
-            name = Release;
+			name = Release;
-        };
+		};
 /* End XCBuildConfiguration section */
 /* Begin XCConfigurationList section */
-        8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */ = {
+		8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */ = {
-            isa = XCConfigurationList;
+			isa = XCConfigurationList;
-            buildConfigurations = (
+			buildConfigurations = (
-                8A1C837F2AC328BE0096AF73 /* Debug */,
+				8A1C837F2AC328BE0096AF73 /* Debug */,
-                8A1C83802AC328BE0096AF73 /* Release */,
+				8A1C83802AC328BE0096AF73 /* Release */,
-            );
+			);
-            defaultConfigurationIsVisible = 0;
+			defaultConfigurationIsVisible = 0;
-            defaultConfigurationName = Release;
+			defaultConfigurationName = Release;
-        };
+		};
-        8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */ = {
+		8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */ = {
-            isa = XCConfigurationList;
+			isa = XCConfigurationList;
-            buildConfigurations = (
+			buildConfigurations = (
-                8A1C83822AC328BE0096AF73 /* Debug */,
+				8A1C83822AC328BE0096AF73 /* Debug */,
-                8A1C83832AC328BE0096AF73 /* Release */,
+				8A1C83832AC328BE0096AF73 /* Release */,
-            );
+			);
-            defaultConfigurationIsVisible = 0;
+			defaultConfigurationIsVisible = 0;
-            defaultConfigurationName = Release;
+			defaultConfigurationName = Release;
-        };
+		};
 /* End XCConfigurationList section */
-    };
+	};
-    rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */;
+	rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */;
 }
--- a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
+++ b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
@ -3,24 +3,26 @@ import Foundation
@MainActor
 class LlamaState: ObservableObject {
    @Published var messageLog = ""
    @Published var cacheCleared = false
    private var llamaContext: LlamaContext?
-    private var modelUrl: URL? {
+    private var defaultModelUrl: URL? {
-        Bundle.main.url(forResource: "q8_0", withExtension: "gguf", subdirectory: "models")
+        Bundle.main.url(forResource: "ggml-model", withExtension: "gguf", subdirectory: "models")
        // Bundle.main.url(forResource: "llama-2-7b-chat", withExtension: "Q2_K.gguf", subdirectory: "models")
    }
    init() {
        do {
-            try loadModel()
+            try loadModel(modelUrl: defaultModelUrl)
        } catch {
            messageLog += "Error!\n"
        }
    }
-    private func loadModel() throws {
+    func loadModel(modelUrl: URL?) throws {
        messageLog += "Loading model...\n"
        if let modelUrl {
-            llamaContext = try LlamaContext.createContext(path: modelUrl.path())
+            llamaContext = try LlamaContext.create_context(path: modelUrl.path())
            messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
        } else {
            messageLog += "Could not locate model\n"
@ -31,7 +33,7 @@ class LlamaState: ObservableObject {
        guard let llamaContext else {
            return
        }
-        messageLog += "Attempting to complete text...\n"
+
        await llamaContext.completion_init(text: text)
        messageLog += "\(text)"
@ -42,4 +44,42 @@ class LlamaState: ObservableObject {
        await llamaContext.clear()
        messageLog += "\n\ndone\n"
    }
    func bench() async {
        guard let llamaContext else {
            return
        }
        messageLog += "\n"
        messageLog += "Running benchmark...\n"
        messageLog += "Model info: "
        messageLog += await llamaContext.model_info() + "\n"
        let t_start = DispatchTime.now().uptimeNanoseconds
        await llamaContext.bench(pp: 8, tg: 4, pl: 1) // heat up
        let t_end = DispatchTime.now().uptimeNanoseconds
        let t_heat = Double(t_end - t_start) / 1_000_000_000.0
        messageLog += "Heat up time: \(t_heat) seconds, please wait...\n"
        // if more than 5 seconds, then we're probably running on a slow device
        if t_heat > 5.0 {
            messageLog += "Heat up time is too long, aborting benchmark\n"
            return
        }
        let result = await llamaContext.bench(pp: 512, tg: 128, pl: 1, nr: 3)
        messageLog += "\(result)"
        messageLog += "\n"
    }
    func clear() async {
        guard let llamaContext else {
            return
        }
        await llamaContext.clear()
        messageLog = ""
    }
 }
--- a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
+++ b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
@ -5,24 +5,132 @@ struct ContentView: View {
    @State private var multiLineText = ""
    private static func cleanupModelCaches() {
        // Delete all models (*.gguf)
        let fileManager = FileManager.default
        let documentsUrl =  FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0]
        do {
            let fileURLs = try fileManager.contentsOfDirectory(at: documentsUrl, includingPropertiesForKeys: nil)
            for fileURL in fileURLs {
                if fileURL.pathExtension == "gguf" {
                    try fileManager.removeItem(at: fileURL)
                }
            }
        } catch {
            print("Error while enumerating files \(documentsUrl.path): \(error.localizedDescription)")
        }
    }
    var body: some View {
        VStack {
-            ScrollView(.vertical) {
+            ScrollView(.vertical, showsIndicators: true) {
                Text(llamaState.messageLog)
                .font(.system(size: 12))
                .frame(maxWidth: .infinity, alignment: .leading)
                .padding()
                .onTapGesture {
                    UIApplication.shared.sendAction(#selector(UIResponder.resignFirstResponder), to: nil, from: nil, for: nil)
                }
            }
            TextEditor(text: $multiLineText)
-                .frame(height: 200)
+                .frame(height: 80)
                .padding()
                .border(Color.gray, width: 0.5)
-            Button(action: {
+
-                sendText()
+            HStack {
-            }) {
+                Button("Send") {
-                Text("Send")
+                    sendText()
-                    .padding()
+                }
-                    .background(Color.blue)
+                .padding(8)
-                    .foregroundColor(.white)
+                .background(Color.blue)
-                    .cornerRadius(8)
+                .foregroundColor(.white)
                .cornerRadius(8)
                Button("Bench") {
                    bench()
                }
                .padding(8)
                .background(Color.blue)
                .foregroundColor(.white)
                .cornerRadius(8)
                Button("Clear") {
                    clear()
                }
                .padding(8)
                .background(Color.blue)
                .foregroundColor(.white)
                .cornerRadius(8)
                Button("Copy") {
                    UIPasteboard.general.string = llamaState.messageLog
                }
                .padding(8)
                .background(Color.blue)
                .foregroundColor(.white)
                .cornerRadius(8)
            }
            VStack {
                DownloadButton(
                    llamaState: llamaState,
                    modelName: "TinyLlama-1.1B (Q4_0, 0.6 GiB)",
                    modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",
                    filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf"
                )
                .font(.system(size: 12))
                .padding(.top, 4)
                .frame(maxWidth: .infinity, alignment: .leading)
                DownloadButton(
                    llamaState: llamaState,
                    modelName: "TinyLlama-1.1B (Q8_0, 1.1 GiB)",
                    modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q8_0.gguf?download=true",
                    filename: "tinyllama-1.1b-1t-openorca.Q8_0.gguf"
                )
                .font(.system(size: 12))
                DownloadButton(
                    llamaState: llamaState,
                    modelName: "TinyLlama-1.1B (F16, 2.2 GiB)",
                    modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true",
                    filename: "tinyllama-1.1b-f16.gguf"
                )
                .font(.system(size: 12))
                .frame(maxWidth: .infinity, alignment: .leading)
                DownloadButton(
                    llamaState: llamaState,
                    modelName: "Phi-2.7B (Q4_0, 1.6 GiB)",
                    modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true",
                    filename: "phi-2-q4_0.gguf"
                )
                .font(.system(size: 12))
                DownloadButton(
                    llamaState: llamaState,
                    modelName: "Phi-2.7B (Q8_0, 2.8 GiB)",
                    modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q8_0.gguf?download=true",
                    filename: "phi-2-q8_0.gguf"
                )
                .font(.system(size: 12))
                .frame(maxWidth: .infinity, alignment: .leading)
                DownloadButton(
                    llamaState: llamaState,
                    modelName: "Mistral-7B-v0.1 (Q4_0, 3.8 GiB)",
                    modelUrl: "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_0.gguf?download=true",
                    filename: "mistral-7b-v0.1.Q4_0.gguf"
                )
                .font(.system(size: 12))
                Button("Clear downloaded models") {
                    ContentView.cleanupModelCaches()
                    llamaState.cacheCleared = true
                }
                .padding(8)
                .font(.system(size: 12))
            }
        }
        .padding()
@ -34,9 +142,20 @@ struct ContentView: View {
            multiLineText = ""
        }
    }
    func bench() {
        Task {
            await llamaState.bench()
        }
    }
    func clear() {
        Task {
            await llamaState.clear()
        }
    }
 }
-/*
+
-#Preview {
+//#Preview {
-    ContentView()
+//    ContentView()
-}
+//}
 */
--- a/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift
+++ b/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift
@ -0,0 +1,122 @@
 import SwiftUI
 struct DownloadButton: View {
    @ObservedObject private var llamaState: LlamaState
    private var modelName: String
    private var modelUrl: String
    private var filename: String
    @State private var status: String
    @State private var downloadTask: URLSessionDownloadTask?
    @State private var progress = 0.0
    @State private var observation: NSKeyValueObservation?
    private static func getFileURL(filename: String) -> URL {
        FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0].appendingPathComponent(filename)
    }
    private func checkFileExistenceAndUpdateStatus() {
    }
    init(llamaState: LlamaState, modelName: String, modelUrl: String, filename: String) {
        self.llamaState = llamaState
        self.modelName = modelName
        self.modelUrl = modelUrl
        self.filename = filename
        let fileURL = DownloadButton.getFileURL(filename: filename)
        status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download"
    }
    private func download() {
        status = "downloading"
        print("Downloading model \(modelName) from \(modelUrl)")
        guard let url = URL(string: modelUrl) else { return }
        let fileURL = DownloadButton.getFileURL(filename: filename)
        downloadTask = URLSession.shared.downloadTask(with: url) { temporaryURL, response, error in
            if let error = error {
                print("Error: \(error.localizedDescription)")
                return
            }
            guard let response = response as? HTTPURLResponse, (200...299).contains(response.statusCode) else {
                print("Server error!")
                return
            }
            do {
                if let temporaryURL = temporaryURL {
                    try FileManager.default.copyItem(at: temporaryURL, to: fileURL)
                    print("Writing to \(filename) completed")
                    llamaState.cacheCleared = false
                    status = "downloaded"
                }
            } catch let err {
                print("Error: \(err.localizedDescription)")
            }
        }
        observation = downloadTask?.progress.observe(\.fractionCompleted) { progress, _ in
            self.progress = progress.fractionCompleted
        }
        downloadTask?.resume()
    }
    var body: some View {
        VStack {
            if status == "download" {
                Button(action: download) {
                    Text("Download " + modelName)
                }
            } else if status == "downloading" {
                Button(action: {
                    downloadTask?.cancel()
                    status = "download"
                }) {
                    Text("\(modelName) (Downloading \(Int(progress * 100))%)")
                }
            } else if status == "downloaded" {
                Button(action: {
                    let fileURL = DownloadButton.getFileURL(filename: filename)
                    if !FileManager.default.fileExists(atPath: fileURL.path) {
                        download()
                        return
                    }
                    do {
                        try llamaState.loadModel(modelUrl: fileURL)
                    } catch let err {
                        print("Error: \(err.localizedDescription)")
                    }
                }) {
                    Text("\(modelName) (Downloaded)")
                }
            } else {
                Text("Unknown status")
            }
        }
        .onDisappear() {
            downloadTask?.cancel()
        }
        .onChange(of: llamaState.cacheCleared) { newValue in
            if newValue {
                downloadTask?.cancel()
                let fileURL = DownloadButton.getFileURL(filename: filename)
                status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download"
            }
        }
    }
 }
 // #Preview {
 //    DownloadButton(
 //        llamaState: LlamaState(),
 //        modelName: "TheBloke / TinyLlama-1.1B-1T-OpenOrca-GGUF (Q4_0)",
 //        modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",
 //        filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf"
 //    )
 // }
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@ -514,7 +514,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            ctx_size += padded_size;
            if (verbosity >= 3) {
                printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, padded_size=%zu, offset=%zu\n", __func__, i,
-                       cur->n_dims, cur->name, tensor_size, padded_size, offset);
+                       ggml_n_dims(cur), cur->name, tensor_size, padded_size, offset);
            }
        }
    }
@ -962,7 +962,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
        }
        // quantize only 2D tensors
-        quantize &= (cur->n_dims == 2);
+        quantize &= (ggml_n_dims(cur) == 2);
        if (quantize) {
            new_type = type;
@ -1035,7 +1035,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
            fout.put(0);
        }
-        printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), cur->n_dims, quantize,
+        printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
               orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
    }
--- a/examples/server/public/completion.js
+++ b/examples/server/public/completion.js
@ -34,7 +34,8 @@ export async function* llama(prompt, params = {}, config = {}) {
    headers: {
      'Connection': 'keep-alive',
      'Content-Type': 'application/json',
-      'Accept': 'text/event-stream'
+      'Accept': 'text/event-stream',
      ...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
    },
    signal: controller.signal,
  });
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@ -235,7 +235,8 @@
      grammar: '',
      n_probs: 0, // no completion_probabilities,
      image_data: [],
-      cache_prompt: true
+      cache_prompt: true,
      api_key: ''
    })
    /* START: Support for storing prompt templates and parameters in browsers LocalStorage */
@ -790,6 +791,10 @@
            <fieldset>
              ${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
            </fieldset>
            <fieldset>
              <label for="api_key">API Key</label>
              <input type="text" name="api_key" value="${params.value.api_key}" placeholder="Enter API key" oninput=${updateParams} />
            </fieldset>
          </details>
        </form>
      `
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -10,7 +10,8 @@
 // crash the server in debug mode, otherwise send an http 500 error
 #define CPPHTTPLIB_NO_EXCEPTIONS 1
 #endif
-
+// increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
 #include "httplib.h"
 #include "json.hpp"
@ -36,6 +37,7 @@ using json = nlohmann::json;
 struct server_params
 {
    std::string hostname = "127.0.0.1";
    std::string api_key;
    std::string public_path = "examples/server/public";
    int32_t port = 8080;
    int32_t read_timeout = 600;
@ -1953,6 +1955,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    printf("  --host                ip address to listen (default  (default: %s)\n", sparams.hostname.c_str());
    printf("  --port PORT           port to listen (default  (default: %d)\n", sparams.port);
    printf("  --path PUBLIC_PATH    path from which to serve static files (default %s)\n", sparams.public_path.c_str());
    printf("  --api-key API_KEY     optional api key to enhance server security. If set, requests must include this key for access.\n");
    printf("  -to N, --timeout N    server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
    printf("  --embedding           enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
    printf("  -np N, --parallel N   number of slots for process requests (default: %d)\n", params.n_parallel);
@ -2002,6 +2005,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            }
            sparams.public_path = argv[i];
        }
        else if (arg == "--api-key")
        {
            if (++i >= argc)
            {
                invalid_param = true;
                break;
            }
            sparams.api_key = argv[i];
        }
        else if (arg == "--timeout" || arg == "-to")
        {
            if (++i >= argc)
@ -2402,7 +2414,7 @@ json oaicompat_completion_params_parse(
    llama_params["ignore_eos"]        = json_value(body, "ignore_eos", false);
    llama_params["tfs_z"]             = json_value(body, "tfs_z", 0.0);
-    if (llama_params.count("grammar") != 0) {
+    if (body.count("grammar") != 0) {
        llama_params["grammar"] = json_value(body, "grammar", json::object());
    }
@ -2633,6 +2645,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
 int main(int argc, char **argv)
 {
 #if SERVER_VERBOSE != 1
    log_disable();
 #endif
    // own arguments required by this example
    gpt_params params;
    server_params sparams;
@ -2669,6 +2684,32 @@ int main(int argc, char **argv)
    httplib::Server svr;
    // Middleware for API key validation
    auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
        // If API key is not set, skip validation
        if (sparams.api_key.empty()) {
            return true;
        }
        // Check for API key in the header
        auto auth_header = req.get_header_value("Authorization");
        std::string prefix = "Bearer ";
        if (auth_header.substr(0, prefix.size()) == prefix) {
            std::string received_api_key = auth_header.substr(prefix.size());
            if (received_api_key == sparams.api_key) {
                return true; // API key is valid
            }
        }
        // API key is invalid or not provided
        res.set_content("Unauthorized: Invalid API Key", "text/plain; charset=utf-8");
        res.status = 401; // Unauthorized
        LOG_WARNING("Unauthorized: Invalid API Key", {});
        return false;
    };
    svr.set_default_headers({{"Server", "llama.cpp"},
                             {"Access-Control-Allow-Origin", "*"},
                             {"Access-Control-Allow-Headers", "content-type"}});
@ -2676,28 +2717,28 @@ int main(int argc, char **argv)
    // this is only called if no index.html is found in the public --path
    svr.Get("/", [](const httplib::Request &, httplib::Response &res)
            {
-                res.set_content(reinterpret_cast<const char*>(&index_html), index_html_len, "text/html");
+                res.set_content(reinterpret_cast<const char*>(&index_html), index_html_len, "text/html; charset=utf-8");
                return false;
            });
    // this is only called if no index.js is found in the public --path
    svr.Get("/index.js", [](const httplib::Request &, httplib::Response &res)
            {
-                res.set_content(reinterpret_cast<const char *>(&index_js), index_js_len, "text/javascript");
+                res.set_content(reinterpret_cast<const char *>(&index_js), index_js_len, "text/javascript; charset=utf-8");
                return false;
            });
    // this is only called if no index.html is found in the public --path
    svr.Get("/completion.js", [](const httplib::Request &, httplib::Response &res)
            {
-                res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript");
+                res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript; charset=utf-8");
                return false;
            });
    // this is only called if no index.html is found in the public --path
    svr.Get("/json-schema-to-grammar.mjs", [](const httplib::Request &, httplib::Response &res)
            {
-                res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript");
+                res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript; charset=utf-8");
                return false;
            });
@ -2708,23 +2749,26 @@ int main(int argc, char **argv)
                    { "user_name",      llama.name_user.c_str() },
                    { "assistant_name", llama.name_assistant.c_str() }
                };
-                res.set_content(data.dump(), "application/json");
+                res.set_content(data.dump(), "application/json; charset=utf-8");
            });
-    svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res)
+    svr.Post("/completion", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
            {
                if (!validate_api_key(req, res)) {
                    return;
                }
                json data = json::parse(req.body);
                const int task_id = llama.request_completion(data, false, false, -1);
                if (!json_value(data, "stream", false)) {
                    std::string completion_text;
                    task_result result = llama.next_result(task_id);
                    if (!result.error && result.stop) {
-                        res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
+                        res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
                    }
                    else
                    {
                        res.status = 404;
-                        res.set_content(result.result_json["content"], "text/plain");
+                        res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
                        return;
                    }
                } else {
@ -2795,12 +2839,15 @@ int main(int argc, char **argv)
                    }}
                };
-                res.set_content(models.dump(), "application/json");
+                res.set_content(models.dump(), "application/json; charset=utf-8");
            });
    // TODO: add mount point without "/v1" prefix -- how?
-    svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req, httplib::Response &res)
+    svr.Post("/v1/chat/completions", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
            {
                if (!validate_api_key(req, res)) {
                    return;
                }
                json data = oaicompat_completion_params_parse(json::parse(req.body));
                const int task_id = llama.request_completion(data, false, false, -1);
@ -2814,10 +2861,10 @@ int main(int argc, char **argv)
                        res.set_content(oaicompat_result.dump(-1, ' ', false,
                                            json::error_handler_t::replace),
-                                            "application/json");
+                                            "application/json; charset=utf-8");
                    } else {
                        res.status = 500;
-                        res.set_content(result.result_json["content"], "text/plain");
+                        res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
                        return;
                    }
                } else {
@ -2869,8 +2916,11 @@ int main(int argc, char **argv)
                }
            });
-    svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
+    svr.Post("/infill", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
            {
                if (!validate_api_key(req, res)) {
                    return;
                }
                json data = json::parse(req.body);
                const int task_id = llama.request_completion(data, true, false, -1);
                if (!json_value(data, "stream", false)) {
@ -2878,12 +2928,12 @@ int main(int argc, char **argv)
                    task_result result = llama.next_result(task_id);
                    if (!result.error && result.stop)
                    {
-                        res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
+                        res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
                    }
                    else
                    {
                        res.status = 404;
-                        res.set_content(result.result_json["content"], "text/plain");
+                        res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
                        return;
                    }
                } else {
@ -2932,11 +2982,11 @@ int main(int argc, char **argv)
    svr.Get("/model.json", [&llama](const httplib::Request &, httplib::Response &res)
            {
                const json data = llama.get_model_props();
-                return res.set_content(data.dump(), "application/json");
+                return res.set_content(data.dump(), "application/json; charset=utf-8");
            });
    svr.Options(R"(/.*)", [](const httplib::Request &, httplib::Response &res)
-                { return res.set_content("", "application/json"); });
+                { return res.set_content("", "application/json; charset=utf-8"); });
    svr.Post("/tokenize", [&llama](const httplib::Request &req, httplib::Response &res)
            {
@ -2947,7 +2997,7 @@ int main(int argc, char **argv)
                    tokens = llama.tokenize(body["content"], false);
                }
                const json data = format_tokenizer_response(tokens);
-                return res.set_content(data.dump(), "application/json");
+                return res.set_content(data.dump(), "application/json; charset=utf-8");
            });
    svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res)
@ -2961,7 +3011,7 @@ int main(int argc, char **argv)
                }
                const json data = format_detokenized_response(content);
-                return res.set_content(data.dump(), "application/json");
+                return res.set_content(data.dump(), "application/json; charset=utf-8");
            });
    svr.Post("/embedding", [&llama](const httplib::Request &req, httplib::Response &res)
@ -2978,7 +3028,7 @@ int main(int argc, char **argv)
                }
                const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1);
                task_result result = llama.next_result(task_id);
-                return res.set_content(result.result_json.dump(), "application/json");
+                return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
            });
    svr.set_logger(log_server_request);
@ -2999,19 +3049,23 @@ int main(int argc, char **argv)
                {
                    snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
                }
-                res.set_content(buf, "text/plain");
+                res.set_content(buf, "text/plain; charset=utf-8");
                res.status = 500;
            });
    svr.set_error_handler([](const httplib::Request &, httplib::Response &res)
            {
                if (res.status == 401)
                {
                    res.set_content("Unauthorized", "text/plain; charset=utf-8");
                }
                if (res.status == 400)
                {
-                    res.set_content("Invalid request", "text/plain");
+                    res.set_content("Invalid request", "text/plain; charset=utf-8");
                }
-                else if (res.status != 500)
+                else if (res.status == 404)
                {
-                    res.set_content("File Not Found", "text/plain");
+                    res.set_content("File Not Found", "text/plain; charset=utf-8");
                    res.status = 404;
                }
            });
@ -3032,11 +3086,15 @@ int main(int argc, char **argv)
    // to make it ctrl+clickable:
    LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
-    LOG_INFO("HTTP server listening", {
+    std::unordered_map<std::string, std::string> log_data;
-                                          {"hostname", sparams.hostname},
+    log_data["hostname"] = sparams.hostname;
-                                          {"port", sparams.port},
+    log_data["port"] = std::to_string(sparams.port);
                                      });
    if (!sparams.api_key.empty()) {
        log_data["api_key"] = "api_key: ****" + sparams.api_key.substr(sparams.api_key.length() - 4);
    }
    LOG_INFO("HTTP server listening", log_data);
    // run the HTTP server in a thread - see comment below
    std::thread t([&]()
            {
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -31,6 +31,7 @@
 #define CUDA_R_16F  HIPBLAS_R_16F
 #define CUDA_R_32F  HIPBLAS_R_32F
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
 #define cublasGemmEx hipblasGemmEx
 #define cublasGemmBatchedEx hipblasGemmBatchedEx
@ -40,6 +41,7 @@
 #define cublasSetStream hipblasSetStream
 #define cublasSgemm hipblasSgemm
 #define cublasStatus_t hipblasStatus_t
 #define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
 #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
 #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
 #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
@ -4998,7 +5000,16 @@ static __global__ void rope_neox(
    const int ib = col / n_dims;
    const int ic = col % n_dims;
-    const int i = row*ncols + ib*n_dims + ic/2;
+    if (ib > 0) {
        const int i = row*ncols + ib*n_dims + ic;
        dst[i + 0] = x[i + 0];
        dst[i + 1] = x[i + 1];
        return;
    }
    const int i  = row*ncols + ib*n_dims + ic/2;
    const int i2 = row/p_delta_rows;
    float cur_rot = inv_ndims * ic - ib;
@ -7057,6 +7068,7 @@ inline void ggml_cuda_op_upscale(
    (void) src1;
    (void) dst;
    (void) src1_dd;
 }
 inline void ggml_cuda_op_pad(
@ -7073,6 +7085,7 @@ inline void ggml_cuda_op_pad(
    (void) src1;
    (void) dst;
    (void) src1_dd;
 }
 inline void ggml_cuda_op_rms_norm(
@ -7376,7 +7389,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
    const int compute_capability = g_compute_capabilities[id];
-    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
+    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
        // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
        half * src0_as_f16 = nullptr;
        size_t src0_as = 0;
@ -8300,27 +8313,27 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
 }
 static __global__ void k_compute_batched_ptrs(
-        const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
+        const half * src0_as_f16, const half * src1_as_f16, char * dst,
        const void ** ptrs_src, void ** ptrs_dst,
-        int ne12, int ne13,
+        int64_t ne12, int64_t ne13,
-        int ne23,
+        int64_t ne23,
-        int nb02, int nb03,
+        size_t  nb02, size_t  nb03,
-        int nb12, int nb13,
+        size_t  nb12, size_t  nb13,
-        int nb2, int nb3,
+        size_t  nbd2, size_t  nbd3,
-        int r2, int r3) {
+        int64_t r2,   int64_t r3) {
-    int i13 = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
-    int i12 = blockIdx.y * blockDim.y + threadIdx.y;
+    int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
    if (i13 >= ne13 || i12 >= ne12) {
        return;
    }
-    int i03 = i13 / r3;
+    int64_t i03 = i13 / r3;
-    int i02 = i12 / r2;
+    int64_t i02 = i12 / r2;
    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02   + i03*nb03;
    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
-    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)     dst_f16 + i12* nb2/2 + i13* nb3/2;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)         dst + i12*nbd2   + i13*nbd3;
 }
 static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@ -8376,7 +8389,41 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
    size_t dst_as = 0;
-    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+
    half * dst_f16 = nullptr;
    char * dst_t   = nullptr;
    cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
    cudaDataType_t      cu_data_type    = CUDA_R_16F;
    // dst strides
    size_t nbd2 = dst->nb[2];
    size_t nbd3 = dst->nb[3];
    const half  alpha_f16 = 1.0f;
    const half  beta_f16  = 0.0f;
    const float alpha_f32 = 1.0f;
    const float beta_f32  = 0.0f;
    const void * alpha = &alpha_f16;
    const void * beta  = &beta_f16;
    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
        dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
        dst_t   = (char *) dst_f16;
        nbd2 /= sizeof(float) / sizeof(half);
        nbd3 /= sizeof(float) / sizeof(half);
    } else {
        dst_t = (char *) dst_ddf;
        cu_compute_type = CUBLAS_COMPUTE_32F;
        cu_data_type    = CUDA_R_32F;
        alpha = &alpha_f32;
        beta  = &beta_f32;
    }
    GGML_ASSERT(ne12 % ne02 == 0);
    GGML_ASSERT(ne13 % ne03 == 0);
@ -8385,9 +8432,6 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
    const int64_t r2 = ne12/ne02;
    const int64_t r3 = ne13/ne03;
    const half alpha_f16 = 1.0f;
    const half beta_f16  = 0.0f;
 #if 0
    // use cublasGemmEx
    {
@ -8397,12 +8441,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
                int i02 = i12 / r2;
                CUBLAS_CHECK(
-                        cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                        cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                            ne01, ne11, ne10,
-                            &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F, nb01/sizeof(half),
+                            alpha, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F,   nb01/sizeof(half),
-                                        (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
+                                   (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F,   nb11/sizeof(float),
-                            &beta_f16,  (      char *)     dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01,
+                            beta,  (      char *)       dst_t + i12*nbd2          + i13*nbd3,          cu_data_type, ne01,
-                            CUBLAS_COMPUTE_16F,
+                            cu_compute_type,
                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
            }
        }
@ -8414,11 +8458,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
        CUBLAS_CHECK(
        cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                ne01, ne11, ne10,
-                &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half),  src0->nb[2]/sizeof(half),  // strideA
+                alpha, (const char *) src0_as_f16, CUDA_R_16F,   nb01/sizeof(half),  src0->nb[2]/sizeof(half),  // strideA
-                            (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
+                       (const char *) src1_as_f16, CUDA_R_16F,   nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
-                &beta_f16,  (      char *)     dst_f16, CUDA_R_16F, ne01,                dst->nb[2]/sizeof(float), // strideC
+                beta,  (      char *)       dst_t, cu_data_type, ne01,                dst->nb[2]/sizeof(float), // strideC
                ne12*ne13,
-                CUBLAS_COMPUTE_16F,
+                cu_compute_type,
                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
    } else {
        // use cublasGemmBatchedEx
@ -8435,24 +8479,24 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
        dim3 block_dims(ne13, ne12);
        k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
-                src0_as_f16, src1_as_f16, dst_f16,
+                src0_as_f16, src1_as_f16, dst_t,
                ptrs_src, ptrs_dst,
                ne12, ne13,
                ne23,
                nb02, nb03,
                nb12, nb13,
-                dst->nb[2], dst->nb[3],
+                nbd2, nbd3,
                r2, r3);
        CUDA_CHECK(cudaGetLastError());
        CUBLAS_CHECK(
        cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                ne01, ne11, ne10,
-                &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+                alpha, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F,   nb01/sizeof(half),
-                            (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+                       (const void **) (ptrs_src + 1*ne23), CUDA_R_16F,   nb11/sizeof(float),
-                &beta_f16,  (      void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
+                beta,  (      void **) (ptrs_dst + 0*ne23), cu_data_type, ne01,
                ne23,
-                CUBLAS_COMPUTE_16F,
+                cu_compute_type,
                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
        if (ptrs_src_s != 0) {
@ -8464,11 +8508,14 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
    }
 #endif
-    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
-    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
+        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
        to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
        ggml_cuda_pool_free(dst_f16, dst_as);
    }
    ggml_cuda_pool_free(src1_as_f16, src1_as);
    ggml_cuda_pool_free(dst_f16, dst_as);
 }
 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@ -8898,6 +8945,12 @@ static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, gg
    (void) dst;
 }
 static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
    return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
 }
 void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
    const int64_t nrows = ggml_nrows(tensor);
@ -8947,8 +9000,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
        }
        char * buf;
@ -9485,8 +9537,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
    if (ggml_is_quantized(tensor->type)) {
        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
        }
    }
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@ -1702,8 +1702,9 @@ kernel void kernel_rope(
            dst_data[1] = x0*sin_theta + x1*cos_theta;
        }
    } else {
-        for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
+        for (int64_t ic = 2*tiitg; ic < ne0; ic += 2*tptg.x) {
-            for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
+            if (ic < n_dims) {
                const int64_t ib = 0;
                // simplified from `(ib * n_dims + ic) * inv_ndims`
                const float cur_rot = inv_ndims*ic - ib;
@ -1722,6 +1723,14 @@ kernel void kernel_rope(
                dst_data[0]        = x0*cos_theta - x1*sin_theta;
                dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
            } else {
                const int64_t i0 = ic;
                device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                device       T * dst_data  = (device T *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
                dst_data[0] = src[0];
                dst_data[1] = src[1];
            }
        }
    }
--- a/ggml.c
+++ b/ggml.c
@ -1997,12 +1997,6 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
    return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
 }
 size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
    return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type);
 }
 int ggml_blck_size(enum ggml_type type) {
    return type_traits[type].blck_size;
 }
@ -2054,24 +2048,37 @@ size_t ggml_element_size(const struct ggml_tensor * tensor) {
    return ggml_type_size(tensor->type);
 }
-static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
+bool ggml_is_scalar(const struct ggml_tensor * tensor) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
    return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
 }
-static inline bool ggml_is_vector(const struct ggml_tensor * tensor) {
+bool ggml_is_vector(const struct ggml_tensor * tensor) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
    return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
 }
-static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) {
+bool ggml_is_matrix(const struct ggml_tensor * tensor) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
    return tensor->ne[2] == 1 && tensor->ne[3] == 1;
 }
 bool ggml_is_3d(const struct ggml_tensor * tensor) {
    return tensor->ne[3] == 1;
 }
 int ggml_n_dims(const struct ggml_tensor * tensor) {
    for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
        if (tensor->ne[i] > 1) {
            return i + 1;
        }
    }
    return 1;
 }
 static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
@ -2478,7 +2485,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
        view_src   = view_src->view_src;
    }
-    size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type));
+    size_t data_size = ggml_row_size(type, ne[0]);
    for (int i = 1; i < n_dims; i++) {
        data_size *= ne[i];
    }
@ -2521,7 +2528,6 @@ static struct ggml_tensor * ggml_new_tensor_impl(
        /*.type         =*/ type,
        /*.backend      =*/ GGML_BACKEND_CPU,
        /*.buffer       =*/ NULL,
        /*.n_dims       =*/ n_dims,
        /*.ne           =*/ { 1, 1, 1, 1 },
        /*.nb           =*/ { 0, 0, 0, 0 },
        /*.op           =*/ GGML_OP_NONE,
@ -2628,7 +2634,7 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
 }
 struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
-    return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne);
+    return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
 }
 static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
@ -3077,7 +3083,7 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char *
 struct ggml_tensor * ggml_view_tensor(
        struct ggml_context * ctx,
        struct ggml_tensor  * src) {
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0);
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, GGML_MAX_DIMS, src->ne, src, 0);
    ggml_format_name(result, "%s (view)", src->name);
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
@ -3235,10 +3241,10 @@ static struct ggml_tensor * ggml_add_cast_impl(
        is_node = true;
    }
-    struct ggml_tensor * result = ggml_new_tensor(ctx, type, a->n_dims, a->ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
    result->op   = GGML_OP_ADD;
-    result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne) : NULL;
+    result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne) : NULL;
    result->src[0] = a;
    result->src[1] = b;
@ -3607,12 +3613,12 @@ struct ggml_tensor * ggml_sum_rows(
        is_node = true;
    }
-    int64_t ne[4] = {1,1,1,1};
+    int64_t ne[GGML_MAX_DIMS] = { 1 };
-    for (int i=1; i<a->n_dims; ++i) {
+    for (int i = 1; i < GGML_MAX_DIMS; ++i) {
        ne[i] = a->ne[i];
    }
-    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, a->n_dims, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
    result->op   = GGML_OP_SUM_ROWS;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@ -3633,8 +3639,8 @@ struct ggml_tensor * ggml_mean(
        is_node = true;
    }
-    int64_t ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] };
+    int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
    result->op   = GGML_OP_MEAN;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@ -3656,8 +3662,7 @@ struct ggml_tensor * ggml_argmax(
        is_node = true;
    }
-    int64_t ne[GGML_MAX_DIMS] = { a->ne[1], 1, 1, 1 };
+    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]);
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne);
    result->op   = GGML_OP_ARGMAX;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@ -3680,7 +3685,7 @@ struct ggml_tensor * ggml_repeat(
        is_node = true;
    }
-    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
    result->op   = GGML_OP_REPEAT;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@ -3707,7 +3712,7 @@ struct ggml_tensor * ggml_repeat_back(
        return a;
    }
-    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
    result->op   = GGML_OP_REPEAT_BACK;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@ -4083,7 +4088,7 @@ struct ggml_tensor * ggml_mul_mat(
    }
    const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
    result->op   = GGML_OP_MUL_MAT;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@ -4093,6 +4098,14 @@ struct ggml_tensor * ggml_mul_mat(
    return result;
 }
 void ggml_mul_mat_set_prec(
        struct ggml_tensor * a,
        enum ggml_prec       prec) {
    const int32_t prec_i32 = (int32_t) prec;
    ggml_set_op_params_i32(a, 0, prec_i32);
 }
 // ggml_mul_mat_id
 struct ggml_tensor * ggml_mul_mat_id(
@ -4117,7 +4130,7 @@ struct ggml_tensor * ggml_mul_mat_id(
    }
    const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
    ggml_set_op_params_i32(result, 0, id);
    ggml_set_op_params_i32(result, 1, n_as);
@ -4155,7 +4168,7 @@ struct ggml_tensor * ggml_out_prod(
    // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3]
    const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
    result->op   = GGML_OP_OUT_PROD;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@ -4440,7 +4453,7 @@ struct ggml_tensor * ggml_reshape(
        //GGML_ASSERT(false);
    }
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0);
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
    ggml_format_name(result, "%s (reshaped)", a->name);
    result->op   = GGML_OP_RESHAPE;
@ -4818,7 +4831,7 @@ struct ggml_tensor * ggml_diag(
    }
    const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, MAX(a->n_dims, 2), ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne);
    result->op   = GGML_OP_DIAG;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@ -5465,7 +5478,7 @@ struct ggml_tensor * ggml_pool_1d(
        is_node = true;
    }
-    const int64_t ne[3] = {
+    const int64_t ne[2] = {
        ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
        a->ne[1],
    };
@ -5584,7 +5597,7 @@ struct ggml_tensor * ggml_argsort(
        enum ggml_sort_order  order) {
    bool is_node = false;
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, a->ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
    ggml_set_op_params_i32(result, 0, (int32_t) order);
@ -5631,7 +5644,7 @@ struct ggml_tensor * ggml_flash_attn(
    }
    //struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, q->n_dims, q->ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, q->ne);
    int32_t t = masked ? 1 : 0;
    ggml_set_op_params(result, &t, sizeof(t));
@ -5664,7 +5677,7 @@ struct ggml_tensor * ggml_flash_ff(
    }
    //struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne);
    result->op   = GGML_OP_FLASH_FF;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@ -5780,7 +5793,6 @@ struct ggml_tensor * ggml_win_part(
    const int np  = npx*npy;
    const int64_t ne[4] = { a->ne[0], w, w, np, };
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
    int32_t params[] = { npx, npy, w };
@ -9164,6 +9176,8 @@ static void ggml_compute_forward_norm_f32(
    float eps;
    memcpy(&eps, dst->op_params, sizeof(float));
    GGML_ASSERT(eps > 0.0f);
    // TODO: optimize
    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i02 = 0; i02 < ne02; i02++) {
@ -9233,6 +9247,8 @@ static void ggml_compute_forward_rms_norm_f32(
    float eps;
    memcpy(&eps, dst->op_params, sizeof(float));
    GGML_ASSERT(eps > 0.0f);
    // TODO: optimize
    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i02 = 0; i02 < ne02; i02++) {
@ -9576,16 +9592,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(
 }
 #endif
 // off1 = offset in i11 and i1
 // cne1 = ne11 and ne1
 // in a normal matrix multiplication, off1 = 0 and cne1 = ne1
 // during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1
 static void ggml_compute_forward_mul_mat(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst,
+              struct ggml_tensor * dst) {
              int64_t off1, int64_t cne1) {
    int64_t t0 = ggml_perf_time_us();
    UNUSED(t0);
@ -9653,9 +9664,9 @@ static void ggml_compute_forward_mul_mat(
                const int64_t i03 = i13/r3;
                const int64_t i02 = i12/r2;
-                const void  * x = (char *)            src0->data +             i02*nb02 + i03*nb03;
+                const void  * x = (char *)            src0->data + i02*nb02 + i03*nb03;
-                const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13);
+                const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
-                      float * d = (float *) ((char *)  dst->data + off1*nb1  + i12*nb2  + i13*nb3);
+                      float * d = (float *) ((char *)  dst->data + i12*nb2  + i13*nb3);
                if (type != GGML_TYPE_F32) {
                            float * const wdata    = params->wdata;
@ -9672,7 +9683,7 @@ static void ggml_compute_forward_mul_mat(
                }
                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                         cne1, ne01, ne10,
+                          ne1, ne01, ne10,
                         1.0f,    y, ne10,
                                  x, ne00,
                         0.0f,    d, ne01);
@ -9688,7 +9699,7 @@ static void ggml_compute_forward_mul_mat(
    if (params->type == GGML_TASK_INIT) {
        if (src1->type != vec_dot_type) {
            char * wdata = params->wdata;
-            const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
+            const size_t row_size = ggml_row_size(vec_dot_type, ne10);
            assert(params->wsize >= ne11*ne12*ne13*row_size);
            assert(src1->type == GGML_TYPE_F32);
@ -9711,10 +9722,10 @@ static void ggml_compute_forward_mul_mat(
    }
    const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-    const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
+    const size_t row_size = ggml_row_size(vec_dot_type, ne10);
-    const int64_t nr0 = ne01;           // src0 rows
+    const int64_t nr0 = ne01;          // src0 rows
-    const int64_t nr1 = cne1*ne12*ne13; // src1 rows
+    const int64_t nr1 = ne1*ne12*ne13; // src1 rows
    //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
@ -9756,9 +9767,9 @@ static void ggml_compute_forward_mul_mat(
    for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
        for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-                const int64_t i13 = (ir1/(ne12*cne1));
+                const int64_t i13 = (ir1/(ne12*ne1));
-                const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
+                const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
-                const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1;
+                const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
                // broadcast src0 into src1
                const int64_t i03 = i13/r3;
@ -9798,28 +9809,191 @@ static void ggml_compute_forward_mul_mat(
 static void ggml_compute_forward_mul_mat_id(
        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
+        const struct ggml_tensor * ids,
        const struct ggml_tensor * src1,
              struct ggml_tensor * dst) {
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+    const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS
        // during GGML_TASK_INIT the entire src1 is converted to vec_dot_type
        ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]);
        return;
    }
-    const struct ggml_tensor * ids = src0;
+    GGML_TENSOR_BINARY_OP_LOCALS
    const int ith = params->ith;
    const int nth = params->nth;
    const enum ggml_type type = src0->type;
    const bool src1_cont = ggml_is_contiguous(src1);
    ggml_vec_dot_t    const vec_dot               = type_traits[type].vec_dot;
    enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
    ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
    GGML_ASSERT(ne0 == ne01);
    GGML_ASSERT(ne1 == ne11);
    GGML_ASSERT(ne2 == ne12);
    GGML_ASSERT(ne3 == ne13);
    // we don't support permuted src0 or src1
    GGML_ASSERT(nb00 == ggml_type_size(type));
    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
    // dst cannot be transposed or permuted
    GGML_ASSERT(nb0 == sizeof(float));
    GGML_ASSERT(nb0 <= nb1);
    GGML_ASSERT(nb1 <= nb2);
    GGML_ASSERT(nb2 <= nb3);
    // broadcast factors
    const int64_t r2 = ne12/ne02;
    const int64_t r3 = ne13/ne03;
    // row groups
    const int id   = ggml_get_op_params_i32(dst, 0);
    const int n_as = ggml_get_op_params_i32(dst, 1);
-    for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+    char * wdata_src1_end = (src1->type == vec_dot_type) ?
-        const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
+            (char *) params->wdata :
            (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
-        GGML_ASSERT(row_id >= 0 && row_id < n_as);
+    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
    int64_t * matrix_rows       = matrix_row_counts + n_as;     // [n_as][ne11]
-        const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+    #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
-        ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
+
   if (params->type == GGML_TASK_INIT) {
        char * wdata = params->wdata;
        if (src1->type != vec_dot_type) {
            const size_t row_size = ggml_row_size(vec_dot_type, ne10);
            assert(params->wsize >= ne11*ne12*ne13*row_size);
            assert(src1->type == GGML_TYPE_F32);
            for (int64_t i13 = 0; i13 < ne13; ++i13) {
                for (int64_t i12 = 0; i12 < ne12; ++i12) {
                    for (int64_t i11 = 0; i11 < ne11; ++i11) {
                        from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
                        wdata += row_size;
                    }
                }
            }
        }
        // initialize matrix_row_counts
        GGML_ASSERT(wdata == wdata_src1_end);
        memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
        // group rows by src0 matrix
        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
            const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
            GGML_ASSERT(row_id >= 0 && row_id < n_as);
            MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
            matrix_row_counts[row_id] += 1;
        }
        return;
    }
    if (params->type == GGML_TASK_FINALIZE) {
        return;
    }
    // compute each matrix multiplication in sequence
    for (int cur_a = 0; cur_a < n_as; ++cur_a) {
        const int64_t cne1 = matrix_row_counts[cur_a];
        if (cne1 == 0) {
            continue;
        }
        const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
        const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
        const int64_t nr0 = ne01;           // src0 rows
        const int64_t nr1 = cne1*ne12*ne13; // src1 rows
        //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
        // distribute the thread work across the inner or outer loop based on which one is larger
        const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
        const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
        const int64_t ith0 = ith % nth0;
        const int64_t ith1 = ith / nth0;
        const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
        const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
        const int64_t ir010 = dr0*ith0;
        const int64_t ir011 = MIN(ir010 + dr0, nr0);
        const int64_t ir110 = dr1*ith1;
        const int64_t ir111 = MIN(ir110 + dr1, nr1);
        //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
        // threads with no work simply yield (not sure if it helps)
        if (ir010 >= ir011 || ir110 >= ir111) {
            sched_yield();
            continue;
        }
        assert(ne12 % ne02 == 0);
        assert(ne13 % ne03 == 0);
        // block-tiling attempt
        const int64_t blck_0 = 16;
        const int64_t blck_1 = 16;
        // attempt to reduce false-sharing (does not seem to make a difference)
        float tmp[16];
        for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
            for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
                for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
                    const int64_t  i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix
                    const int64_t  i12 = (ir1 - i13*ne12*cne1)/cne1;
                    const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
                    const int64_t  i11 = MMID_MATRIX_ROW(cur_a, _i11);
                    // broadcast src0 into src1
                    const int64_t i03 = i13/r3;
                    const int64_t i02 = i12/r2;
                    const int64_t i1 = i11;
                    const int64_t i2 = i12;
                    const int64_t i3 = i13;
                    const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03);
                    // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
                    //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
                    //       the original src1 data pointer, so we should index using the indices directly
                    // TODO: this is a bit of a hack, we should probably have a better way to handle this
                    const char * src1_col = (const char *) wdata +
                        (src1_cont || src1->type != vec_dot_type
                        ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
                        : (i11*nb11 + i12*nb12 + i13*nb13));
                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
                    //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
                    //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
                    //}
                    for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
                        vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
                    }
                    memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
                }
            }
        }
    }
    #undef MMID_MATRIX_ROW
 }
 // ggml_compute_forward_out_prod
@ -11400,10 +11574,13 @@ static void ggml_compute_forward_rope_f32(
                    }
                } else {
                    // TODO: this might be wrong for ne0 != n_dims - need double check
-                    // ref:  https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
+                    //       it seems we have to rope just the first n_dims elements and do nothing with the rest
                    // ref:  https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
                    theta_base *= freq_scale;
-                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
+                    for (int64_t ic = 0; ic < ne0; ic += 2) {
-                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
+                        if (ic < n_dims) {
                            const int64_t ib = 0;
                            // simplified from `(ib * n_dims + ic) * inv_ndims`
                            float cur_rot = inv_ndims * ic - ib;
@ -11426,6 +11603,14 @@ static void ggml_compute_forward_rope_f32(
                            dst_data[0]        = x0*cos_theta - x1*sin_theta;
                            dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
                        } else {
                            const int64_t i0 = ic;
                            const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                                  float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
                            dst_data[0] = src[0];
                            dst_data[1] = src[1];
                        }
                    }
                }
@ -11553,10 +11738,13 @@ static void ggml_compute_forward_rope_f16(
                    }
                } else {
                    // TODO: this might be wrong for ne0 != n_dims - need double check
-                    // ref:  https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
+                    //       it seems we have to rope just the first n_dims elements and do nothing with the rest
                    // ref:  https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
                    theta_base *= freq_scale;
-                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
+                    for (int64_t ic = 0; ic < ne0; ic += 2) {
-                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
+                        if (ic < n_dims) {
                            const int64_t ib = 0;
                            // simplified from `(ib * n_dims + ic) * inv_ndims`
                            float cur_rot = inv_ndims * ic - ib;
@ -11579,6 +11767,14 @@ static void ggml_compute_forward_rope_f16(
                            dst_data[0]        = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
                            dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
                        } else {
                            const int64_t i0 = ic;
                            const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                                  ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
                            dst_data[0] = src[0];
                            dst_data[1] = src[1];
                        }
                    }
                }
@ -14187,7 +14383,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            } break;
        case GGML_OP_MUL_MAT:
            {
-                ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]);
+                ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
            } break;
        case GGML_OP_MUL_MAT_ID:
            {
@ -14563,7 +14759,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
        return replacements->vals[i];
    }
-    struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne);
+    struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, GGML_MAX_DIMS, node->ne);
    // insert clone into replacements
    GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite
@ -15987,7 +16183,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
            } break;
        case GGML_OP_MUL_MAT_ID:
            {
                // FIXME: blas
                n_tasks = n_threads;
            } break;
        case GGML_OP_OUT_PROD:
@ -16316,25 +16511,21 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                    } else
 #endif
                    if (node->src[1]->type != vec_dot_type) {
-                        cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
+                        cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
                    }
                } break;
            case GGML_OP_MUL_MAT_ID:
                {
-                    const struct ggml_tensor * a = node->src[2];
+                    const struct ggml_tensor * src0 = node->src[2];
-                    const struct ggml_tensor * b = node->src[1];
+                    const struct ggml_tensor * src1 = node->src[1];
-                    const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type;
+                    const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+                    if (src1->type != vec_dot_type) {
-                    if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) {
+                        cur = ggml_row_size(vec_dot_type, ggml_nelements(src1));
                        if (a->type != GGML_TYPE_F32) {
                            // here we need memory just for single 2D matrix from src0
                            cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
                        }
                    } else
 #endif
                    if (b->type != vec_dot_type) {
                        cur = ggml_type_size(vec_dot_type)*ggml_nelements(b)/ggml_blck_size(vec_dot_type);
                    }
                    const int n_as = ggml_get_op_params_i32(node, 1);
                    cur = GGML_PAD(cur, sizeof(int64_t));        // align
                    cur += n_as * sizeof(int64_t);               // matrix_row_counts
                    cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
                } break;
            case GGML_OP_OUT_PROD:
                {
@ -16564,7 +16755,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
    fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n",
            ggml_type_name(tensor->type),
            ggml_op_name  (tensor->op),
-            tensor->n_dims,
+            ggml_n_dims(tensor),
            ne[0], ne[1], ne[2], ne[3],
            nb[0], nb[1], nb[2], nb[3],
            tensor->data,
@ -16579,7 +16770,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
            arg,
            ggml_type_name(tensor->type),
            ggml_op_name  (tensor->op),
-            tensor->n_dims,
+            ggml_n_dims(tensor),
            ne[0], ne[1], ne[2], ne[3],
            nb[0], nb[1], nb[2], nb[3],
            tensor->data,
@ -16669,11 +16860,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
                const uint32_t type   = tensor->type;
                const uint32_t op     = tensor->op;
                const uint32_t n_dims = tensor->n_dims;
                fwrite(&type,   sizeof(uint32_t), 1, fout);
                fwrite(&op,     sizeof(uint32_t), 1, fout);
                fwrite(&n_dims, sizeof(uint32_t), 1, fout);
                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
                    const uint64_t ne = tensor->ne[j];
@ -16703,11 +16892,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
                const uint32_t type   = tensor->type;
                const uint32_t op     = tensor->op;
                const uint32_t n_dims = tensor->n_dims;
                fwrite(&type,   sizeof(uint32_t), 1, fout);
                fwrite(&op,     sizeof(uint32_t), 1, fout);
                fwrite(&n_dims, sizeof(uint32_t), 1, fout);
                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
                    const uint64_t ne = tensor->ne[j];
@ -16879,12 +17066,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
        {
            uint32_t type;
            uint32_t op;
            uint32_t n_dims;
            for (uint32_t i = 0; i < n_leafs; ++i) {
                type   = *(const uint32_t *) ptr; ptr += sizeof(type);
                op     = *(const uint32_t *) ptr; ptr += sizeof(op);
                n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
                int64_t ne[GGML_MAX_DIMS];
                size_t  nb[GGML_MAX_DIMS];
@ -16900,7 +17085,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
                    nb[j] = nb_cur;
                }
-                struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+                struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
                tensor->op = (enum ggml_op) op;
@ -16917,7 +17102,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
                ptr += ggml_nbytes(tensor);
-                fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+                fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
            }
        }
@ -16927,12 +17112,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
        {
            uint32_t type;
            uint32_t op;
            uint32_t n_dims;
            for (uint32_t i = 0; i < n_nodes; ++i) {
                type   = *(const uint32_t *) ptr; ptr += sizeof(type);
                op     = *(const uint32_t *) ptr; ptr += sizeof(op);
                n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
                enum ggml_op eop = (enum ggml_op) op;
@ -17003,7 +17186,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
                        } break;
                    default:
                        {
-                            tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+                            tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne);
                            tensor->op = eop;
                        } break;
@ -17022,7 +17205,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
                result->nodes[i] = tensor;
-                fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+                fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
            }
        }
    }
@ -17160,7 +17343,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
            fprintf(fp, "(%s)|", ggml_type_name(node->type));
        }
-        if (node->n_dims == 2) {
+        if (ggml_is_matrix(node)) {
            fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
        } else {
            fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
@ -17427,7 +17610,7 @@ static enum ggml_opt_result ggml_opt_adam(
            int64_t i = 0;
            for (int p = 0; p < np; ++p) {
                const int64_t ne = ggml_nelements(ps[p]);
-                const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched;
+                const float p_decay = ((ggml_n_dims(ps[p]) >= decay_min_ndim) ? decay : 0.0f) * sched;
                for (int64_t j = 0; j < ne; ++j) {
                    float x  = ggml_get_f32_1d(ps[p], j);
                    float g_ = g[i]*gnorm;
@ -18701,7 +18884,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                return NULL;
            }
-            const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);
+            const size_t size_cur = ggml_row_size(info->type, ne);
            ctx->size += GGML_PAD(size_cur, ctx->alignment);
        }
@ -19205,8 +19388,8 @@ void gguf_add_tensor(
        ctx->infos[idx].ne[i] = 1;
    }
-    ctx->infos[idx].n_dims = tensor->n_dims;
+    ctx->infos[idx].n_dims = ggml_n_dims(tensor);
-    for (int i = 0; i < tensor->n_dims; i++) {
+    for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) {
        ctx->infos[idx].ne[i] = tensor->ne[i];
    }
--- a/ggml.h
+++ b/ggml.h
@ -303,7 +303,7 @@ extern "C" {
 #if defined(__ARM_NEON) && defined(__CUDACC__)
    typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON)
+#elif defined(__ARM_NEON) && !defined(_MSC_VER)
    typedef __fp16 ggml_fp16_t;
 #else
    typedef uint16_t ggml_fp16_t;
@ -343,6 +343,12 @@ extern "C" {
        GGML_TYPE_COUNT,
    };
    // precision
    enum ggml_prec {
        GGML_PREC_DEFAULT,
        GGML_PREC_F32,
    };
    enum ggml_backend_type {
        GGML_BACKEND_CPU = 0,
        GGML_BACKEND_GPU = 10,
@ -502,7 +508,6 @@ extern "C" {
        struct ggml_backend_buffer * buffer;
        int     n_dims;
        int64_t ne[GGML_MAX_DIMS]; // number of elements
        size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
                                   // nb[0] = ggml_type_size(type)
@ -534,7 +539,7 @@ extern "C" {
        void * extra; // extra things e.g. for ggml-cuda.cu
-        char padding[12];
+        char padding[8];
    };
    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@ -639,7 +644,6 @@ extern "C" {
    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
    GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
    GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
    GGML_API int    ggml_blck_size(enum ggml_type type);
    GGML_API size_t ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
@ -666,6 +670,11 @@ extern "C" {
    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
    GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
    GGML_API bool ggml_is_scalar    (const struct ggml_tensor * tensor);
    GGML_API bool ggml_is_vector    (const struct ggml_tensor * tensor);
    GGML_API bool ggml_is_matrix    (const struct ggml_tensor * tensor);
    GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
    GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
    GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
@ -1054,6 +1063,12 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);
    // change the precision of a matrix multiplication
    // set to GGML_PREC_F32 for higher precision (useful for phi-2)
    GGML_API void ggml_mul_mat_set_prec(
            struct ggml_tensor * a,
            enum ggml_prec       prec);
    // indirect matrix multiplication
    //  ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
    GGML_API struct ggml_tensor * ggml_mul_mat_id(
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@ -95,6 +95,7 @@ class MODEL_ARCH(IntEnum):
    BLOOM     = auto()
    STABLELM  = auto()
    QWEN      = auto()
    PHI2      = auto()
 class MODEL_TENSOR(IntEnum):
@ -140,6 +141,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.BLOOM:          "bloom",
    MODEL_ARCH.STABLELM:       "stablelm",
    MODEL_ARCH.QWEN:           "qwen",
    MODEL_ARCH.PHI2:           "phi2",
 }
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@ -350,6 +352,17 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
    MODEL_ARCH.GPT2: [
        # TODO
    ],
    MODEL_ARCH.PHI2: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ]
    # TODO
 }
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@ -17,6 +17,7 @@ class TensorNameMap:
            "tok_embeddings",                            # llama-pth
            "embeddings.word_embeddings",                # bert
            "language_model.embedding.word_embeddings",  # persimmon
            "transformer.embd.wte",                      # phi2
        ),
        # Token type embeddings
@ -41,6 +42,7 @@ class TensorNameMap:
            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen
            "output",                    # llama-pth bloom
            "word_embeddings_for_head",  # persimmon
            "lm_head.linear",            # phi2
        ),
        # Output norm
@ -53,6 +55,7 @@ class TensorNameMap:
            "transformer.norm_f",                      # mpt
            "ln_f",                                    # refact bloom qwen
            "language_model.encoder.final_layernorm",  # persimmon
            "lm_head.ln",                              # phi2
        ),
        # Rope frequencies
@ -75,6 +78,7 @@ class TensorNameMap:
            "encoder.layer.{bid}.attention.output.LayerNorm",       # bert
            "language_model.encoder.layers.{bid}.input_layernorm",  # persimmon
            "model.layers.{bid}.ln1",                               # yi
            "transformer.h.{bid}.ln",                               # phi2
        ),
        # Attention norm 2
@ -90,6 +94,7 @@ class TensorNameMap:
            "transformer.h.{bid}.self_attention.query_key_value",                  # falcon
            "h.{bid}.self_attention.query_key_value",                              # bloom
            "language_model.encoder.layers.{bid}.self_attention.query_key_value",  # persimmon
            "transformer.h.{bid}.mixer.Wqkv",                                      # phi2
        ),
        # Attention query
@ -128,6 +133,7 @@ class TensorNameMap:
            "encoder.layer.{bid}.attention.output.dense",                # bert
            "transformer.h.{bid}.attn.out_proj",                         # gpt-j
            "language_model.encoder.layers.{bid}.self_attention.dense",  # persimmon
            "transformer.h.{bid}.mixer.out_proj",                        # phi2
        ),
        # Rotary embeddings
@ -167,6 +173,7 @@ class TensorNameMap:
            "transformer.h.{bid}.mlp.fc_in",                          # gpt-j
            "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h",  # persimmon
            "transformer.h.{bid}.mlp.w1",                             # qwen
            "transformer.h.{bid}.mlp.fc1",                            # phi2
        ),
        MODEL_TENSOR.FFN_UP_EXP: (
@ -198,6 +205,7 @@ class TensorNameMap:
            "encoder.layer.{bid}.output.dense",                       # bert
            "transformer.h.{bid}.mlp.fc_out",                         # gpt-j
            "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h",  # persimmon
            "transformer.h.{bid}.mlp.fc2",                            # phi2
        ),
        MODEL_TENSOR.FFN_DOWN_EXP: (
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@ -109,8 +109,10 @@ class SpecialVocab:
        return True
    def _set_special_token(self, typ: str, tid: Any) -> None:
-        if not isinstance(tid, int) or tid < 0:
+        if not isinstance(tid, int):
            return
        if tid < 0:
            raise ValueError(f'invalid value for special token type {typ}: {tid}')
        if self.n_vocab is None or tid < self.n_vocab:
            if typ in self.special_token_ids:
                return
--- a/llama.cpp
+++ b/llama.cpp
@ -195,6 +195,7 @@ enum llm_arch {
    LLM_ARCH_BLOOM,
    LLM_ARCH_STABLELM,
    LLM_ARCH_QWEN,
    LLM_ARCH_PHI2,
    LLM_ARCH_UNKNOWN,
 };
@ -212,6 +213,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
    { LLM_ARCH_BLOOM,           "bloom"     },
    { LLM_ARCH_STABLELM,        "stablelm"  },
    { LLM_ARCH_QWEN,            "qwen"      },
    { LLM_ARCH_PHI2,            "phi2"      },
 };
 enum llm_kv {
@ -550,6 +552,19 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
        },
    },
    {
        LLM_ARCH_PHI2,
        {
            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
            { LLM_TENSOR_OUTPUT,          "output" },
            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
        },
    },
    {
        LLM_ARCH_UNKNOWN,
@ -1420,6 +1435,7 @@ struct llama_model {
    struct ggml_tensor * output_norm;
    struct ggml_tensor * output_norm_b;
    struct ggml_tensor * output;
    struct ggml_tensor * output_b;
    std::vector<llama_layer> layers;
@ -1505,6 +1521,10 @@ struct llama_context {
    // decode output (2-dimensional array: [n_tokens][n_vocab])
    std::vector<float> logits;
 #ifndef NDEBUG
    // guard against access to unset logits
    std::vector<bool>  logits_valid;
 #endif
    bool logits_all = false;
    // input embedding (1-dimensional array: [n_embd])
@ -1933,7 +1953,7 @@ namespace GGUFMeta {
                target = override->bool_value;
                return true;
            }
-            return true;
+            return false;
        }
        template<typename OT>
@ -2397,25 +2417,25 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
    switch (ftype) {
        case LLAMA_FTYPE_ALL_F32:     return "all F32";
-        case LLAMA_FTYPE_MOSTLY_F16:  return "mostly F16";
+        case LLAMA_FTYPE_MOSTLY_F16:  return "F16";
-        case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
-        case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
+        case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
        case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
-                                      return "mostly Q4_1, some F16";
+                                      return "Q4_1, some F16";
-        case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
+        case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
-        case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
+        case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
-        case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
+        case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
        // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K:   return "mostly Q2_K";
+        case LLAMA_FTYPE_MOSTLY_Q2_K:   return "Q2_K";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
-        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q6_K:   return "mostly Q6_K";
+        case LLAMA_FTYPE_MOSTLY_Q6_K:   return "Q6_K";
        default: return "unknown, may not work";
    }
@ -2533,6 +2553,7 @@ static void llm_load_hparams(
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 22: model.type = e_model::MODEL_1B; break;
                    case 26: model.type = e_model::MODEL_3B; break;
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 40: model.type = e_model::MODEL_13B; break;
@ -2634,6 +2655,15 @@ static void llm_load_hparams(
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_PHI2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_3B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        default: (void)0;
    }
@ -2989,7 +3019,7 @@ static bool llm_load_tensors(
    (void) main_gpu;
-    enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
+    enum ggml_backend_type llama_backend_offload       = GGML_BACKEND_CPU;
    enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
 #ifdef GGML_USE_CUBLAS
@ -3632,7 +3662,73 @@ static bool llm_load_tensors(
                        }
                    }
                } break;
            case LLM_ARCH_PHI2:
                {
                    model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
                    // output
                    {
                        ggml_backend_type backend_norm;
                        ggml_backend_type backend_output;
                        if (n_gpu_layers > int(n_layer)) {
                            backend_norm   = llama_backend_offload;
                            backend_output = llama_backend_offload;
                        } else {
                            backend_norm   = GGML_BACKEND_CPU;
                            backend_output = GGML_BACKEND_CPU;
                        }
                        model.output_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd},          backend_norm);
                        model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd},          backend_norm);
                        model.output        = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, backend_output);
                        model.output_b      = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT,      "bias"),   {n_vocab},         backend_output);
                        if (backend_norm == GGML_BACKEND_GPU) {
                            vram_weights += ggml_nbytes(model.output_norm);
                            vram_weights += ggml_nbytes(model.output_norm_b);
                            vram_weights += ggml_nbytes(model.output);
                            vram_weights += ggml_nbytes(model.output_b);
                        }
                    }
                    const uint32_t n_ff = hparams.n_ff;
                    const int i_gpu_start = n_layer - n_gpu_layers;
                    model.layers.resize(n_layer);
                    for (uint32_t i = 0; i < n_layer; ++i) {
                        const ggml_backend_type backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
                        auto & layer = model.layers[i];
                        layer.attn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
                        layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, backend);
                        layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
                        layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa},         backend);
                        layer.wo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
                        layer.bo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd},         backend);
                        layer.ffn_down   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
                        layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd},       backend);
                        layer.ffn_up   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
                        layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff},         backend);
                        if (backend == GGML_BACKEND_GPU) {
                            vram_weights +=
                                ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
                                ggml_nbytes(layer.wqkv)      + ggml_nbytes(layer.bqkv)        +
                                ggml_nbytes(layer.wo)        + ggml_nbytes(layer.bo)          +
                                ggml_nbytes(layer.ffn_up)    + ggml_nbytes(layer.ffn_up_b)    +
                                ggml_nbytes(layer.ffn_down)  + ggml_nbytes(layer.ffn_down_b);
                        }
                    }
                } break;
            default:
                throw std::runtime_error("unknown architecture");
        }
@ -3998,6 +4094,7 @@ static struct ggml_tensor * llm_build_ffn(
 // if max_alibi_bias > 0 then apply ALiBi
 static struct ggml_tensor * llm_build_kqv(
        struct ggml_context * ctx,
          const llama_model & model,
        const llama_hparams & hparams,
       const llama_kv_cache & kv,
         struct ggml_tensor * wo,
@ -4009,6 +4106,7 @@ static struct ggml_tensor * llm_build_kqv(
                    int32_t   n_tokens,
                    int32_t   n_kv,
                    float     max_alibi_bias,
                    float     scale,
         const llm_build_cb & cb,
                    int       il) {
    const int64_t n_embd      = hparams.n_embd;
@ -4031,6 +4129,12 @@ static struct ggml_tensor * llm_build_kqv(
    struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
    cb(kq, "kq", il);
    if (model.arch == LLM_ARCH_PHI2) {
        // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
        // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
        ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
    }
    if (max_alibi_bias > 0.0f) {
        // temporary branch until we figure out how to handle ggml_alibi through ggml_add
        kq = ggml_scale(ctx, kq, kq_scale);
@ -4050,7 +4154,7 @@ static struct ggml_tensor * llm_build_kqv(
        kq = ggml_soft_max(ctx, kq);
        cb(kq, "kq_soft_max", il);
    } else {
-        kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head)));
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, scale);
        cb(kq, "kq_soft_max_ext", il);
    }
@ -4257,9 +4361,9 @@ struct llm_build_context {
                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                        model.layers[il].wo, model.layers[il].bo,
-                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -4440,9 +4544,9 @@ struct llm_build_context {
                // apply ALiBi for 13B model
                const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                        model.layers[il].wo, NULL,
-                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, cb, il);
+                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -4564,9 +4668,9 @@ struct llm_build_context {
                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                        model.layers[il].wo, NULL,
-                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -4664,9 +4768,9 @@ struct llm_build_context {
                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                        model.layers[il].wo, model.layers[il].bo,
-                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -4873,9 +4977,9 @@ struct llm_build_context {
                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
                // TODO: not tested, could be broken
-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                        model.layers[il].wo, model.layers[il].bo,
-                        Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                        Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -4964,9 +5068,9 @@ struct llm_build_context {
                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                        model.layers[il].wo, NULL,
-                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il);
+                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -5061,9 +5165,9 @@ struct llm_build_context {
                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                        model.layers[il].wo, model.layers[il].bo,
-                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il);
+                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -5155,9 +5259,9 @@ struct llm_build_context {
                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                        model.layers[il].wo, NULL,
-                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, cb, il);
+                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -5268,9 +5372,9 @@ struct llm_build_context {
                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                        model.layers[il].wo, NULL,
-                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -5327,15 +5431,15 @@ struct llm_build_context {
        cb(inpL, "inp_embd", -1);
        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos= ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
        cb(inp_pos, "inp_pos", -1);
        // KQ_scale
-        struct ggml_tensor * KQ_scale= ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+        struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
        cb(KQ_scale, "KQ_scale", -1);
        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask= ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
        cb(KQ_mask, "KQ_mask", -1);
        // shift the entire K-cache if needed
@ -5385,9 +5489,9 @@ struct llm_build_context {
                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                        model.layers[il].wo, NULL,
-                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -5429,6 +5533,122 @@ struct llm_build_context {
        ggml_build_forward_expand(gf, cur);
        return gf;
    }
    struct ggml_cgraph * build_phi2() {
        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
        struct ggml_tensor * cur;
        struct ggml_tensor * attn_norm_output;
        struct ggml_tensor * ffn_output;
        struct ggml_tensor * inpL;
        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
        cb(inpL, "inp_embd", -1);
        // inp_pos - contains the positions
        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
        cb(inp_pos, "inp_pos", -1);
        // Q_scale
        struct ggml_tensor * Q_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
        cb(Q_scale, "Q_scale", -1);
        // KQ_scale
        struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
        cb(KQ_scale, "KQ_scale", -1);
        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
        cb(KQ_mask, "KQ_mask", -1);
        // shift the entire K-cache if needed
        if (do_rope_shift) {
            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
        }
        for (int il = 0; il < n_layer; ++il) {
            attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
                    model.layers[il].attn_norm,
                    model.layers[il].attn_norm_b,
                    LLM_NORM, cb, il);
            cb(attn_norm_output, "attn_norm", il);
            // self-attention
            {
                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
                cb(cur, "wqkv", il);
                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                cb(cur, "bqkv", il);
                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
                cb(Qcur, "Qcur", il);
                cb(Kcur, "Kcur", il);
                cb(Vcur, "Vcur", il);
                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                Qcur = ggml_rope_custom(
                    ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                );
                cb(Qcur, "Qcur", il);
                Qcur = ggml_scale(ctx0, Qcur, Q_scale);
                cb(Qcur, "Qcur", il);
                Kcur = ggml_rope_custom(
                    ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                );
                cb(Kcur, "Kcur", il);
                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
                        model.layers[il].wo, model.layers[il].bo,
                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il);
                cb(cur, "kqv_out", il);
            }
            // FF
            {
                ffn_output = llm_build_ffn(ctx0, attn_norm_output,
                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
                        NULL,                      NULL,
                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                cb(ffn_output, "ffn_out", il);
            }
            cur = ggml_add(ctx0, cur, ffn_output);
            cb(cur, "l_out", il);
            cur = ggml_add(ctx0, cur, inpL);
            cb(cur, "l_out", il);
            inpL = cur;
        }
        cur = llm_build_norm(ctx0, inpL, hparams,
                model.output_norm,
                model.output_norm_b,
                LLM_NORM, cb, -1);
        cb(cur, "result_norm", -1);
        cur = ggml_mul_mat(ctx0, model.output, cur);
        cb(cur, "result_output_no_bias", -1);
        cur = ggml_add(ctx0, cur, model.output_b);
        cb(cur, "result_output", -1);
        ggml_build_forward_expand(gf, cur);
        return gf;
    }
 };
@ -5444,7 +5664,7 @@ enum llm_offload_func_e {
    OFFLOAD_FUNC_FRC, // force offload
    OFFLOAD_FUNC_KQV,
    OFFLOAD_FUNC_NR,
-    OFFLOAD_FUNC_EMB,
+    OFFLOAD_FUNC_EMB, // embeddings
    OFFLOAD_FUNC_OUT,
 };
@ -5529,6 +5749,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
    { "pos_embd",                   OFFLOAD_FUNC_NR  },
    { "inp_pos",                    OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope)
    { "Q_scale",                    OFFLOAD_FUNC_FRC },
    { "KQ_scale",                   OFFLOAD_FUNC_FRC },
    { "KQ_mask",                    OFFLOAD_FUNC_FRC },
    { "K_shift",                    OFFLOAD_FUNC_FRC },
@ -5613,6 +5834,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
    { "l_out",                      OFFLOAD_FUNC     },
    { "result_norm",                OFFLOAD_FUNC_EMB },
    { "result_output_no_bias",      OFFLOAD_FUNC_EMB },
    { "result_output",              OFFLOAD_FUNC_OUT },
 };
@ -5630,6 +5852,7 @@ static struct ggml_cgraph * llama_build_graph(
    bool alloc_inp_tokens   = false;
    bool alloc_inp_embd     = false;
    bool alloc_inp_pos      = false;
    bool alloc_inp_Q_scale  = false;
    bool alloc_inp_KQ_scale = false;
    bool alloc_inp_KQ_mask  = false;
    bool alloc_inp_K_shift  = false;
@ -5697,7 +5920,7 @@ static struct ggml_cgraph * llama_build_graph(
            alloc_inp_pos = true;
        }
-        if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) {
+        if (!alloc_inp_Q_scale && strcmp(name, "Q_scale") == 0) {
            ggml_allocr_alloc(lctx.alloc, cur);
            if (!ggml_allocr_is_measure(lctx.alloc)) {
@ -5705,6 +5928,23 @@ static struct ggml_cgraph * llama_build_graph(
                ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head)));
            }
            alloc_inp_Q_scale = true;
        }
        if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) {
            ggml_allocr_alloc(lctx.alloc, cur);
            if (!ggml_allocr_is_measure(lctx.alloc)) {
                const int64_t n_embd_head = model.hparams.n_embd_head();
                if (model.arch == LLM_ARCH_PHI2) {
                    // with phi2, we scale the Q to avoid precision issues
                    // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
                    ggml_set_f32(cur, 1.0f);
                } else {
                    ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head)));
                }
            }
            alloc_inp_KQ_scale = true;
        }
@ -5929,6 +6169,10 @@ static struct ggml_cgraph * llama_build_graph(
            {
                result = llm.build_qwen();
            } break;
        case LLM_ARCH_PHI2:
            {
                result = llm.build_phi2();
            } break;
        default:
            GGML_ASSERT(false);
    }
@ -6062,12 +6306,16 @@ static int llama_decode_internal(
    ggml_allocr_alloc_graph(lctx.alloc, gf);
-    struct ggml_tensor * res        = gf->nodes[gf->n_nodes - 1];
+    // the output is always the last tensor in the graph
    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
    GGML_ASSERT(strcmp(res->name, "result_output") == 0);
    // the embeddings could be the second to last tensor, or the third to last tensor
    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
-
+    if (strcmp(embeddings->name, "result_norm") != 0) {
-    GGML_ASSERT(strcmp(res->name,        "result_output") == 0);
+        embeddings = gf->nodes[gf->n_nodes - 3];
-    GGML_ASSERT(strcmp(embeddings->name, "result_norm")   == 0);
+        GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
-
+    }
 #ifdef GGML_USE_CUBLAS
    for (int i = 0; i < gf->n_leafs; i++) {
@ -6162,6 +6410,14 @@ static int llama_decode_internal(
    {
        auto & logits_out = lctx.logits;
 #ifndef NDEBUG
        auto & logits_valid = lctx.logits_valid;
        logits_valid.clear();
        logits_valid.resize(n_tokens);
        logits_out.clear();
 #endif
        if (batch.logits) {
            logits_out.resize(n_vocab * n_tokens);
            for (uint32_t i = 0; i < n_tokens; i++) {
@ -6169,13 +6425,22 @@ static int llama_decode_internal(
                    continue;
                }
                memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab);
 #ifndef NDEBUG
                logits_valid[i] = true;
 #endif
            }
        } else if (lctx.logits_all) {
            logits_out.resize(n_vocab * n_tokens);
            memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens);
 #ifndef NDEBUG
            std::fill(logits_valid.begin(), logits_valid.end(), true);
 #endif
        } else {
            logits_out.resize(n_vocab);
            memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab);
 #ifndef NDEBUG
            logits_valid[0] = true;
 #endif
        }
    }
@ -8483,7 +8748,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
        // quantize only 2D tensors
-        quantize &= (tensor->n_dims == 2);
+        quantize &= (ggml_n_dims(tensor) == 2);
        quantize &= params->quantize_output_tensor || name != "output.weight";
        quantize &= !params->only_copy;
@ -8638,53 +8903,60 @@ static int llama_apply_lora_from_file_internal(
    const int64_t t_start_lora_us = ggml_time_us();
-    auto fin = std::ifstream(path_lora, std::ios::binary);
+    llama_file fin(path_lora, "rb");
    if (!fin) {
        LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora);
        return 1;
    }
    // verify magic and version
    {
-        uint32_t magic;
+        uint32_t magic = fin.read_u32();
-        fin.read((char *) &magic, sizeof(magic));
+        if (magic != LLAMA_FILE_MAGIC_GGLA) {
-        uint32_t format_version;
+            LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
-        fin.read((char *) &format_version, sizeof(format_version));
+            return 1;
        }
        uint32_t format_version = fin.read_u32();
        if (format_version != 1) {
            LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
            return 1;
        }
    }
-    int32_t lora_r;
+    int32_t lora_r = fin.read_u32();
-    int32_t lora_alpha;
+    int32_t lora_alpha = fin.read_u32();
    fin.read((char *) &lora_r, sizeof(lora_r));
    fin.read((char *) &lora_alpha, sizeof(lora_alpha));
    float scaling = scale * (float)lora_alpha / (float)lora_r;
    LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
    // create a name -> tensor map of the model to accelerate lookups
    // find the max tensor size to estimate the required temporary buffer size
    size_t max_tensor_size = 0;
    std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
    for (const auto & kv : model.tensors_by_name) {
        model_tensors.insert(kv);
        size_t f32_size = ggml_nelements(kv.second) * sizeof(float);
        max_tensor_size = std::max(max_tensor_size, f32_size);
    }
    // create a temporary ggml context to store the lora tensors
-    // todo: calculate size from biggest possible tensor
+    // TODO: use ggml-alloc
-    std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
+    size_t lora_ctx_size = max_tensor_size * 3;
    LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0);
    std::vector<uint8_t> lora_buf(lora_ctx_size);
    struct ggml_init_params params;
    params.mem_size   = lora_buf.size();
    params.mem_buffer = lora_buf.data();
    params.no_alloc   = false;
-    ggml_context * lora_ctx = ggml_init(params);
+    using unique_context = std::unique_ptr<ggml_context, decltype(&ggml_free)>;
    std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
-    // create a name -> tensor map of the model to accelerate lookups
+    unique_context lora_ctx(nullptr, ggml_free);
-    std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
+    lora_ctx.reset(ggml_init(params));
-    for (const auto & kv : model.tensors_by_name) {
+    std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
        model_tensors.insert(kv);
    }
    // load base model
    std::unique_ptr<llama_model_loader> ml;
-    ggml_context * base_ctx = NULL;
+
    unique_context base_ctx(nullptr, ggml_free);
    std::vector<uint8_t> base_buf;
    if (path_base_model) {
        LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
@ -8693,6 +8965,7 @@ static int llama_apply_lora_from_file_internal(
        size_t ctx_size;
        size_t mmapped_size;
        ml->calc_sizes(ctx_size, mmapped_size);
        base_buf.resize(ctx_size);
        ggml_init_params base_params;
@ -8700,9 +8973,9 @@ static int llama_apply_lora_from_file_internal(
        base_params.mem_buffer = base_buf.data();
        base_params.no_alloc   = ml->use_mmap;
-        base_ctx = ggml_init(base_params);
+        base_ctx.reset(ggml_init(base_params));
-        // maybe this should in llama_model_loader
+        // maybe this should be in llama_model_loader
        if (ml->use_mmap) {
            ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa()));
        }
@ -8715,27 +8988,35 @@ static int llama_apply_lora_from_file_internal(
    std::vector<uint8_t> work_buffer;
    while (true) {
        if (fin.tell() == fin.size) {
            // eof
            break;
        }
        int32_t n_dims;
-        int32_t length;
+        int32_t name_len;
        int32_t ftype;
-        fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+        fin.read_raw(&n_dims, sizeof(n_dims));
-        fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+        fin.read_raw(&name_len, sizeof(name_len));
-        fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
+        fin.read_raw(&ftype,  sizeof(ftype));
-        if (fin.eof()) {
+
-            break;
+        if (n_dims != 1 && n_dims != 2) {
            LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
            return 1;
        }
        int32_t ne[2] = { 1, 1 };
        for (int i = 0; i < n_dims; ++i) {
-            fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+            fin.read_raw(&ne[i], sizeof(ne[i]));
        }
        std::string name;
        {
            GGML_ASSERT(name_len <= 1024);
            char buf[1024];
-            fin.read(buf, length);
+            fin.read_raw(buf, name_len);
-            name = std::string(buf, length);
+            name = std::string(buf, name_len);
        }
        // check for lora suffix and get the type of tensor
@ -8749,7 +9030,7 @@ static int llama_apply_lora_from_file_internal(
        std::string lora_type = name.substr(pos + lora_suffix.length());
        std::string base_name = name;
        base_name.erase(pos);
-        // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
+        // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str());
        if (model_tensors.find(base_name) == model_tensors.end()) {
            LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
@ -8768,22 +9049,15 @@ static int llama_apply_lora_from_file_internal(
                        return false;
                    }
        }
-        ggml_tensor * lora_tensor;
+        ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]);
-        if (n_dims == 2) {
+        ggml_set_name(lora_tensor, name.c_str());
            lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
        }
        else {
            LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
            return 1;
        }
        ggml_set_name(lora_tensor, "lora_tensor");
        // load tensor data
-        size_t offset = fin.tellg();
+        size_t offset = fin.tell();
        size_t tensor_data_size = ggml_nbytes(lora_tensor);
        offset = (offset + 31) & -32;
-        fin.seekg(offset);
+        fin.seek(offset, SEEK_SET);
-        fin.read((char*)lora_tensor->data, tensor_data_size);
+        fin.read_raw(lora_tensor->data, tensor_data_size);
        lora_tensors[name] = lora_tensor;
@ -8813,13 +9087,11 @@ static int llama_apply_lora_from_file_internal(
                // load from base model
                if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) {
                    // TODO: throw
                    LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
                    return 1;
                }
-                // TODO: not tested!! maybe not working!
+                base_t = ml->create_tensor(base_ctx.get(), base_name, { dest_t->ne[0], dest_t->ne[1] }, GGML_BACKEND_CPU);
                base_t = ml->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
                ml->load_data_for(base_t);
            } else {
                base_t = dest_t;
@ -8848,43 +9120,45 @@ static int llama_apply_lora_from_file_internal(
            }
            // w = w + BA*s
-            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+            ggml_tensor * BA = ggml_mul_mat(lora_ctx.get(), loraA, loraB);
            offload_func(BA);
            ggml_set_name(BA, "BA");
            if (scaling != 1.0f) {
-                ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
+                ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx.get(), scaling);
                ggml_set_name(scale_tensor, "scale_tensor");
-                BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
+                BA = ggml_scale_inplace(lora_ctx.get(), BA, scale_tensor);
                offload_func(BA);
                ggml_set_name(BA, "BA_scaled");
            }
            ggml_tensor * r;
            if (base_t == dest_t) {
-                r = ggml_add_inplace(lora_ctx, dest_t, BA);
+                r = ggml_add_inplace(lora_ctx.get(), dest_t, BA);
                offload_func_force_inplace(r);
                ggml_set_name(r, "r_add_inplace");
            }
            else {
-                r = ggml_add(lora_ctx, base_t, BA);
+                r = ggml_add(lora_ctx.get(), base_t, BA);
                offload_func(r);
                ggml_set_name(r, "r_add");
-                r = ggml_cpy(lora_ctx, r, dest_t);
+                r = ggml_cpy(lora_ctx.get(), r, dest_t);
                offload_func(r);
                ggml_set_name(r, "r_cpy");
            }
-            struct ggml_cgraph * gf = ggml_new_graph(lora_ctx);
+            struct ggml_cgraph * gf = ggml_new_graph(lora_ctx.get());
            ggml_build_forward_expand(gf, r);
            ggml_graph_compute_helper(work_buffer, gf, n_threads);
            // the tensors in the adapter must be sorted such that loraA and loraB of the same tensor are next to each other
            GGML_ASSERT(lora_tensors.size() == 2);
            // we won't need these tensors again, reset the context to save memory
-            ggml_free(lora_ctx);
+            lora_ctx.reset(ggml_init(params));
            lora_ctx = ggml_init(params);
            lora_tensors.clear();
            n_tensors++;
@ -8894,12 +9168,6 @@ static int llama_apply_lora_from_file_internal(
        }
    }
    // TODO: this should be in a destructor, it will leak on failure
    ggml_free(lora_ctx);
    if (base_ctx) {
        ggml_free(base_ctx);
    }
    const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
    LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
@ -10071,6 +10339,7 @@ float * llama_get_logits(struct llama_context * ctx) {
 }
 float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
    assert(ctx->logits_valid.at(i));
    return ctx->logits.data() + i*ctx->model.hparams.n_vocab;
 }
--- a/llama.h
+++ b/llama.h
@ -39,6 +39,7 @@
 #define LLAMA_MAX_RNG_STATE (64*1024)
 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@ -54,7 +54,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
        ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
    } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16) {
        GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
-        std::vector<uint8_t> dataq(ggml_type_size(tensor->type)*size/ggml_blck_size(tensor->type));
+        std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
        int64_t hist[16];
        ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size, hist);
        ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
@ -72,6 +72,8 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
    ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
    size_t bs = ggml_blck_size(t->type);
    std::vector<float> vq(ggml_blck_size(t->type));
    bool quantized = ggml_is_quantized(t->type);
    // access elements by index to avoid gaps in views
    for (int64_t i3 = 0; i3 < t->ne[3]; i3++) {
@ -85,9 +87,8 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
                        tv.push_back(*(float *) &buf[i]);
                    } else if (t->type == GGML_TYPE_I32) {
                        tv.push_back((float)*(int32_t *) &buf[i]);
-                    } else if (ggml_is_quantized(t->type)) {
+                    } else if (quantized) {
-                        std::vector<float> vq(ggml_blck_size(t->type));
+                        tt.to_float(&buf[i], vq.data(), bs);
                        tt.to_float(&buf[i], vq.data(), ggml_blck_size(t->type));
                        tv.insert(tv.end(), vq.begin(), vq.end());
                    } else {
                        GGML_ASSERT(false);
@ -1554,6 +1555,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
        test_cases.emplace_back(new test_rope(type, { 64,   8, 10, 1},  64, 2, 512)); // neox (falcon 40B)
        test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1},  64, 2, 512)); // neox (falcon 40B)
        test_cases.emplace_back(new test_rope(type, { 80,  32, 10, 1},  20, 2, 512)); // neox (stablelm)
        test_cases.emplace_back(new test_rope(type, { 80,  32, 10, 1},  32, 2, 512)); // neox (phi-2)
    }
    test_cases.emplace_back(new test_alibi());
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@ -286,7 +286,7 @@ int main(int argc, char * argv[]) {
                        qfns.from_float_reference(test_data1, test_q1, size);
                        return test_q1[0];
                    };
-                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    size_t quantized_size = ggml_row_size(type, size);
                    benchmark_function(size, quantized_size, iterations, quantize_fn);
                }
                printf("\n");
@ -300,7 +300,7 @@ int main(int argc, char * argv[]) {
                        qfns.from_float(test_data1, test_q1, size);
                        return test_q1[0];
                    };
-                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    size_t quantized_size = ggml_row_size(type, size);
                    benchmark_function(size, quantized_size, iterations, quantize_fn);
                }
                printf("\n");
@ -315,7 +315,7 @@ int main(int argc, char * argv[]) {
                        qfns.to_float(test_q1, test_out, size);
                        return test_out[0];
                    };
-                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    size_t quantized_size = ggml_row_size(type, size);
                    benchmark_function(size, quantized_size, iterations, quantize_fn);
                }
                printf("\n");
@ -330,7 +330,7 @@ int main(int argc, char * argv[]) {
                        vdot.from_float(test_data1, test_q1, size);
                        return test_q1[0];
                    };
-                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    size_t quantized_size = ggml_row_size(type, size);
                    benchmark_function(size, quantized_size, iterations, quantize_fn);
                }
                printf("\n");
@ -347,7 +347,7 @@ int main(int argc, char * argv[]) {
                        qfns.vec_dot(size, &result, test_q1, test_q2);
                        return result;
                    };
-                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    size_t quantized_size = ggml_row_size(type, size);
                    benchmark_function(size, quantized_size, iterations, quantize_fn);
                }
                printf("\n");