goerch 2023-08-17 04:55:26 +02:00
commit d864596e0a
29 changed files with 2506 additions and 8197 deletions

View file

@@ -529,7 +529,6 @@ endif()
 add_library(llama
             llama.cpp
             llama.h
-            llama-util.h
             )

 target_include_directories(llama PUBLIC .)

View file

@ -1,5 +1,5 @@
# Define the default target now so that it is always the first target # Define the default target now so that it is always the first target
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf gguf-llama-simple gptneox-main BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf gptneox-main
# Binaries only useful for tests # Binaries only useful for tests
TEST_TARGETS = tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0 TEST_TARGETS = tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
@ -329,10 +329,7 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
OBJS += ggml-alloc.o OBJS += ggml-alloc.o
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
gguf-llama.o: gguf-llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h gguf-llama.h gguf-util.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
common.o: examples/common.cpp examples/common.h common.o: examples/common.cpp examples/common.h
@ -388,10 +385,7 @@ $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-in
embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS) embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput $(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
gguf: examples/gguf/gguf.cpp build-info.h ggml.o gguf-llama.o $(OBJS) gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
gguf-llama-simple: examples/gguf/gguf-llama-simple.cpp build-info.h ggml.o gguf-llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
gptneox-main: gptneox-main.cpp ggml.o $(OBJS) gptneox-main: gptneox-main.cpp ggml.o $(OBJS)

View file

@@ -1,50 +0,0 @@
GGUF_MAGIC = 0x47475546
GGUF_VERSION = 1
GGUF_DEFAULT_ALIGNMENT = 32
# general
KEY_GENERAL_ARCHITECTURE = "general.architecture"
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
KEY_GENERAL_ALIGNMENT = "general.alignment"
KEY_GENERAL_NAME = "general.name"
KEY_GENERAL_AUTHOR = "general.author"
KEY_GENERAL_URL = "general.url"
KEY_GENERAL_DESCRIPTION = "general.description"
KEY_GENERAL_FILE_TYPE = "general.file_type"
KEY_GENERAL_LICENSE = "general.license"
KEY_GENERAL_SOURCE_URL = "general.source.url"
KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"
# LLM
KEY_LLM_CONTEXT_LENGTH = "{llm}.context_length"
KEY_LLM_EMBEDDING_LENGTH = "{llm}.embedding_length"
KEY_LLM_BLOCK_COUNT = "{llm}.block_count"
KEY_LLM_FEED_FORWARD_LENGTH = "{llm}.feed_forward_length"
KEY_LLM_USE_PARALLEL_RESIDUAL = "{llm}.use_parallel_residual"
KEY_LLM_TENSOR_DATA_LAYOUT = "{llm}.tensor_data_layout"
# attention
KEY_ATTENTION_HEAD_COUNT = "{llm}.attention.head_count"
KEY_ATTENTION_HEAD_COUNT_KV = "{llm}.attention.head_count_kv"
KEY_ATTENTION_MAX_ALIBI_BIAS = "{llm}.attention.max_alibi_bias"
KEY_ATTENTION_CLAMP_KQV = "{llm}.attention.clamp_kqv"
KEY_ATTENTION_LAYERNORM_EPS = "{llm}.attention.layer_norm_epsilon"
KEY_ATTENTION_LAYERNORM_RMS_EPS = "{llm}.attention.layer_norm_rms_epsilon"
# RoPE
KEY_ROPE_DIMENSION_COUNT = "{llm}.rope.dimension_count"
KEY_ROPE_SCALE = "{llm}.rope.scale"
# tokenization
KEY_TOKENIZER_MODEL = "tokenizer.ggml.model"
KEY_TOKENIZER_LIST = "tokenizer.ggml.tokens"
KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"
KEY_TOKENIZER_SCORES = "tokenizer.ggml.scores"
KEY_TOKENIZER_MERGES = "tokenizer.ggml.merges"
KEY_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"
KEY_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"
KEY_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"
KEY_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"
KEY_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"
KEY_TOKENIZER_HF_JSON = "tokenizer.huggingface.json"
KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
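The per-architecture keys in the deleted file above are stored as "{llm}" format templates rather than finished strings; a writer is expected to substitute the architecture name before emitting the key. A minimal sketch of that substitution in Python (the "llama"/"gptneox" values and the expand_key helper are illustrative, not part of this commit):

KEY_LLM_CONTEXT_LENGTH   = "{llm}.context_length"
KEY_ATTENTION_HEAD_COUNT = "{llm}.attention.head_count"

def expand_key(template: str, arch: str) -> str:
    # "{llm}.context_length" formatted with "llama" becomes "llama.context_length"
    return template.format(llm=arch)

print(expand_key(KEY_LLM_CONTEXT_LENGTH, "llama"))      # llama.context_length
print(expand_key(KEY_ATTENTION_HEAD_COUNT, "gptneox"))  # gptneox.attention.head_count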

View file

@@ -1,15 +1,15 @@
 # HF gptneox--> gguf conversion

 import gguf
-import gguf_namemap as tmap
 import os
 import sys
 import struct
 import json
 import numpy as np
+import torch
 from typing import Any, List
 from pathlib import Path
-import torch
 from transformers import AutoTokenizer

 # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
@@ -188,7 +188,7 @@ if Path(dir_model + "/tokenizer.json").is_file():

 # TENSORS

-tensor_map = tmap.get_tensor_namemap(block_count)
+tensor_map = gguf.get_tensor_name_map(block_count)

 # tensor info
 print("gguf: get tensor metadata")

View file

@@ -3,18 +3,17 @@
 # HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model

 import gguf
-import gguf_namemap as tmap
 import os
 import sys
 import struct
 import json
 import numpy as np
 import torch
 from typing import Any, List
 from pathlib import Path
 from sentencepiece import SentencePieceProcessor

 #NDArray = np.ndarray[Any, Any]
 # compatible with python < 3.9
 NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
@@ -96,6 +95,7 @@ gguf_writer.add_architecture(llm_arch)
 gguf_writer.add_name(last_dir)
 gguf_writer.add_file_type( "All tensors F32" if ftype == 0 else "Most tensors F16, some F32")
 gguf_writer.add_source_hf_repo(hf_repo)
+gguf_writer.add_tensor_data_layout(llm_arch, "Meta AI original pth")
 gguf_writer.add_context_length(llm_arch, hparams["max_position_embeddings"])
 gguf_writer.add_embedding_length(llm_arch, hparams["hidden_size"])
 gguf_writer.add_block_count(llm_arch, block_count)
@@ -188,7 +188,7 @@ if Path(dir_model + "/tokenizer.json").is_file():

 # TENSORS

-tensor_map = tmap.get_tensor_namemap(block_count)
+tensor_map = gguf.get_tensor_name_map(block_count)

 # tensor info
 print("gguf: get tensor metadata")
@@ -260,7 +260,6 @@ for part_name in part_names:
     for name in model_part.keys():
         data = model_part[name]
         old_dtype = data.dtype

         # we don't need these

View file

@@ -1,8 +1,6 @@
 # HF llama --> gguf conversion

 import gguf
-import gguf_namemap as tmap
 import os
 import sys
 import struct
@@ -18,7 +16,9 @@ from sentencepiece import SentencePieceProcessor
 # compatible with python < 3.9
 NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'

-def permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
+# reverse HF permute back to original pth layout
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py
+def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
     if n_kv_head is not None and n_head != n_kv_head: n_head //= n_kv_head
     return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
             .swapaxes(1, 2)
@@ -93,11 +93,21 @@ if "_name_or_path" in hparams:
 else:
     hf_repo=""

+if "max_sequence_length" in hparams:
+    ctx_length = hparams["max_sequence_length"]
+elif "max_position_embeddings" in hparams:
+    ctx_length = hparams["max_position_embeddings"]
+else:
+    print("gguf: can not find ctx length parameter.")
+    sys.exit()
+
 gguf_writer.add_architecture(llm_arch)
 gguf_writer.add_name(last_dir)
 gguf_writer.add_file_type("All tensors F32" if ftype == 0 else "Most tensors F16, some F32")
 gguf_writer.add_source_hf_repo(hf_repo)
-gguf_writer.add_context_length(llm_arch, hparams["max_position_embeddings"])
+gguf_writer.add_tensor_data_layout(llm_arch, "Meta AI original pth")
+gguf_writer.add_context_length(llm_arch, ctx_length)
 gguf_writer.add_embedding_length(llm_arch, hparams["hidden_size"])
 gguf_writer.add_block_count(llm_arch, block_count)
 gguf_writer.add_feed_forward_length(llm_arch, hparams["intermediate_size"])
@@ -189,7 +199,7 @@ if Path(dir_model + "/tokenizer.json").is_file():

 # TENSORS

-tensor_map = tmap.get_tensor_namemap(block_count)
+tensor_map = gguf.get_tensor_name_map(block_count)

 # tensor info
 print("gguf: get tensor metadata")
@@ -218,9 +228,9 @@ for part_name in part_names:
         data = data.squeeze().numpy()

-        # permute these
+        # reverse permute these
         if name.endswith(".q_proj.weight") or name.endswith(".k_proj.weight"):
-            data = permute(data, head_count, head_count_kv)
+            data = reverse_hf_permute(data, head_count, head_count_kv)

         # map tensor names
         if name.endswith(".weight") and name[:-7] in tensor_map:
@@ -287,9 +297,9 @@ for part_name in part_names:
         data = data.squeeze().numpy()

-        # permute these
+        # reverse permute these
         if name.endswith(".q_proj.weight") or name.endswith(".k_proj.weight"):
-            data = permute(data, head_count, head_count_kv)
+            data = reverse_hf_permute(data, head_count, head_count_kv)

         # map tensor names
         if name.endswith(".weight") and name[:-7] in tensor_map:
@@ -315,7 +325,7 @@ for part_name in part_names:
             if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
                 data = data.astype(np.float16)

-            print( name + ", shape " + str(len(data.shape)) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+            print(name + ", shape " + str(len(data.shape)) + ", " + str(old_dtype) + " --> " + str(data.dtype))

             gguf_writer.write_tensor_to_file(data)
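The converter above renames permute to reverse_hf_permute because the Q/K projection weights are written back in the original Meta pth layout instead of the interleaved layout produced by the Hugging Face checkpoint conversion. A self-contained sketch of the same reshape/swapaxes shuffle on a dummy weight matrix (the toy shapes are illustrative only, not taken from this commit):

import numpy as np

def reverse_hf_permute(weights, n_head, n_kv_head=None):
    # Same axis shuffle as in the script above: undo the HF head interleaving.
    if n_kv_head is not None and n_head != n_kv_head:
        n_head //= n_kv_head
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                   .swapaxes(1, 2)
                   .reshape(weights.shape))

# Toy example: 4 heads, 8 rows total; the row order changes, the shape does not.
w = np.arange(8 * 3, dtype=np.float32).reshape(8, 3)
assert reverse_hf_permute(w, n_head=4).shape == w.shape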

View file

@@ -104,7 +104,7 @@ TENSORS_SET = set(TENSORS_LIST)

 def find_n_mult(n_ff: int, n_embd: int) -> int:
     # hardcoded magic range
-    for n_mult in range(256, 1, -1):
+    for n_mult in range(8192, 1, -1):
         calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
         if calc_ff == n_ff:
             return n_mult
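Widening the search range matters for the larger LLaMA-2 checkpoints: with the previous upper bound of 256 there is no n_mult that reproduces their feed-forward width. A quick check, assuming the published 70B sizes of n_embd = 8192 and n_ff = 28672 (the function body is the one shown in the hunk above):

def find_n_mult(n_ff: int, n_embd: int) -> int:
    for n_mult in range(8192, 1, -1):
        calc_ff = (((8 * n_embd) // 3 + n_mult - 1) // n_mult) * n_mult
        if calc_ff == n_ff:
            return n_mult
    raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")

# LLaMA-2 70B: the first (largest) matching multiple is 7168, unreachable with range(256, 1, -1).
print(find_n_mult(28672, 8192))  # 7168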
@@ -118,6 +118,7 @@ class Params:
     n_mult:  int
     n_head:  int
     n_layer: int
+    n_kv_head: Optional[int]  # This parameter is only used for Llama 2

     @staticmethod
     def guessed(model: 'LazyModel') -> 'Params':
@@ -144,6 +145,7 @@ class Params:
             n_mult  = 256,
             n_head  = n_head,
             n_layer = n_layer,
+            n_kv_head = None,
         )

     @staticmethod
@@ -155,6 +157,7 @@ class Params:
         n_head  = config["num_attention_heads"];
         n_layer = config["num_hidden_layers"];
         n_ff    = config["intermediate_size"];
+        n_kv_head = config.get("num_key_value_heads")

         n_mult = find_n_mult(n_ff, n_embd);
@@ -164,6 +167,7 @@ class Params:
             n_mult  = n_mult,
             n_head  = n_head,
             n_layer = n_layer,
+            n_kv_head = n_kv_head,
         )

     # LLaMA v2 70B params.json
@@ -187,6 +191,7 @@ class Params:
             n_mult  = n_mult,
             n_head  = n_head,
             n_layer = n_layer,
+            n_kv_head = None,
         )

     @staticmethod
@@ -293,7 +298,9 @@ class SentencePieceVocab:

 Vocab = Union[BpeVocab, SentencePieceVocab]

-def permute(weights: NDArray, n_head: int) -> NDArray:
+def permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
+    if n_kv_head is not None and n_head != n_kv_head:
+        n_head //= n_kv_head
     return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
             .swapaxes(1, 2)
             .reshape(weights.shape))
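The extra n_kv_head argument handles grouped-query attention: the K projection has only n_kv_head groups of rows, so the interleaving has to be undone over n_head // n_kv_head groups rather than over all n_head query heads (LLaMA-2 70B uses 64 query heads and 8 KV heads). A toy illustration with made-up shapes (the head_dim of 4 is purely for illustration):

import numpy as np

def permute(weights, n_head, n_kv_head=None):
    # Same as convert.py above: shrink the group count for GQA checkpoints.
    if n_kv_head is not None and n_head != n_kv_head:
        n_head //= n_kv_head
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                   .swapaxes(1, 2)
                   .reshape(weights.shape))

n_head, n_kv_head, head_dim = 64, 8, 4
wq = np.zeros((n_head * head_dim, 256), dtype=np.float32)     # query projection: 64 heads
wk = np.zeros((n_kv_head * head_dim, 256), dtype=np.float32)  # shared K projection: 8 heads
assert permute(wq, n_head).shape == wq.shape
assert permute(wk, n_head, n_kv_head).shape == wk.shape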
@@ -305,7 +312,7 @@ class Tensor(metaclass=ABCMeta):
     @abstractmethod
     def astype(self, data_type: DataType) -> 'Tensor': ...
     @abstractmethod
-    def permute(self, n_head: int) -> 'Tensor': ...
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'Tensor': ...
     @abstractmethod
     def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
     @abstractmethod
@@ -343,8 +350,8 @@ class UnquantizedTensor(Tensor):
         r = self.ndarray.shape[0] // 3
         return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])

-    def permute(self, n_head: int) -> 'UnquantizedTensor':
-        return UnquantizedTensor(permute(self.ndarray, n_head))
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'UnquantizedTensor':
+        return UnquantizedTensor(permute(self.ndarray, n_head, n_kv_head))


 def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray:
@@ -367,18 +374,18 @@ GGMLCompatibleTensor = Union[UnquantizedTensor]

 class DeferredPermutedTensor(Tensor):
-    def __init__(self, base: Tensor, n_head: int) -> None:
+    def __init__(self, base: Tensor, n_head: int, n_kv_head: Optional[int] = None) -> None:
         self.base = base
         self.n_head = n_head
         self.data_type = self.base.data_type

     def astype(self, data_type: DataType) -> Tensor:
-        return self.base.astype(data_type).permute(self.n_head)
+        return self.base.astype(data_type).permute(self.n_head, self.n_kv_head)

     def to_ggml(self) -> GGMLCompatibleTensor:
-        return self.base.to_ggml().permute(self.n_head)
+        return self.base.to_ggml().permute(self.n_head, self.n_kv_head)

-    def permute(self, n_head: int) -> Tensor:
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
         raise Exception("shouldn't permute twice")
@@ -474,10 +481,10 @@ def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
     return ModelPlus(model, paths, format, vocab)

-def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor:
+def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_kv_head: Optional[int] = None) -> LazyTensor:
     def load() -> Tensor:
-        return lazy_tensor.load().permute(n_head)
-    return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
+        return lazy_tensor.load().permute(n_head, n_kv_head)
+    return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_kv_head}) ' + lazy_tensor.description)

 def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
     def load() -> Tensor:
@@ -502,7 +509,7 @@ def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
     for i in itertools.count():
         if f"model.layers.{i}.self_attn.q_proj.weight" in model:
             out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
-            out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
+            out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_kv_head)
             out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
         elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
             out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head)

View file

@@ -170,18 +170,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_ctx = std::stoi(argv[i]);
-        } else if (arg == "-gqa" || arg == "--gqa") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_gqa = std::stoi(argv[i]);
-        } else if (arg == "-eps" || arg == "--rms-norm-eps") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.rms_norm_eps = std::stof(argv[i]);
         } else if (arg == "--rope-freq-base") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -546,8 +534,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, "  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
     fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
     fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stdout, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
-    fprintf(stdout, "  -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
     fprintf(stdout, "  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
     fprintf(stdout, "  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
     fprintf(stdout, "  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
@@ -638,8 +624,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     lparams.n_ctx        = params.n_ctx;
     lparams.n_batch      = params.n_batch;
-    lparams.n_gqa        = params.n_gqa;
-    lparams.rms_norm_eps = params.rms_norm_eps;
     lparams.n_gpu_layers = params.n_gpu_layers;
     lparams.main_gpu     = params.main_gpu;
     lparams.tensor_split = params.tensor_split;

View file

@@ -23,14 +23,12 @@ struct gpt_params {
     int32_t n_predict                       = -1;   // new tokens to predict
     int32_t n_ctx                           = 512;  // context size
     int32_t n_batch                         = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_gqa                           = 1;    // grouped-query attention factor (TODO: move to hparams)
     int32_t n_keep                          = 0;    // number of tokens to keep from initial prompt
     int32_t n_chunks                        = -1;   // max number of chunks to process (-1 = unlimited)
     int32_t n_gpu_layers                    = 0;    // number of layers to store in VRAM
     int32_t main_gpu                        = 0;    // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs
     int32_t n_probs                         = 0;    // if greater than 0, output the probabilities of top n_probs tokens.
-    float   rms_norm_eps                    = LLAMA_DEFAULT_RMS_EPS; // rms norm epsilon
     float   rope_freq_base                  = 10000.0f; // RoPE base frequency
     float   rope_freq_scale                 = 1.0f;     // RoPE frequency scaling factor

View file

@@ -1,5 +1,6 @@
 #include "ggml.h"
 #include "llama.h"
 #include <unordered_map>
 #include <vector>
 #include <cassert>
@@ -502,7 +503,7 @@ bool is_ggml_file(const char *filename) {
         return false;
     }
     uint32_t magic = file.read_u32();
-    return magic == LLAMA_FILE_MAGIC;
+    return magic == GGUF_MAGIC;
 }

 void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
@@ -590,75 +591,80 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
     if (file.fp == NULL) {
         return;
     }

-    // write_magic
-    file.write_u32(LLAMA_FILE_MAGIC);   // magic
-    file.write_u32(LLAMA_FILE_VERSION); // version
-    // write_hparams
-    file.write_u32(model->hparams.n_vocab);
-    file.write_u32(model->hparams.n_embd);
-    file.write_u32(model->hparams.n_mult);
-    file.write_u32(model->hparams.n_head);
-    file.write_u32(model->hparams.n_layer);
-    file.write_u32(model->hparams.n_rot);
-    file.write_u32(LLAMA_FTYPE_ALL_F32);
-
-    // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
-    uint32_t n_vocab = model->hparams.n_vocab;
-    for (uint32_t i = 0; i < n_vocab; i++) {
-        const auto & token_score = vocab->id_to_token.at(i);
-        file.write_u32((uint32_t) token_score.tok.size());
-        file.write_raw(token_score.tok.data(), token_score.tok.size());
-        file.write_raw(&token_score.score, sizeof(token_score.score));
-    }
-
-    // stuff AK weights into GG weights one by one.
-    // w->token_embedding_table -> model->tok_embeddings
-    // float*                   -> struct ggml_tensor
-    stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
-    stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
-
-    stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
-    //print_row(model->norm, 0);
-
-    // for rms-att-weight
-    int row_length = model->hparams.n_embd;
-    const auto & hparams = model->hparams;
-    //int n_ff = model->hparams.n_embd;
-    int n_ff = get_n_ff(&hparams);
-
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
-        auto & layer = model->layers[i];
-        // 1d
-        stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
-        stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
-
-        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
-        stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);
-
-        stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
-        stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
-        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
-    }
-    // write tensors
-    write_tensor(&file, model->tok_embeddings);
-    write_tensor(&file, model->norm);
-    write_tensor(&file, model->output); // ?
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        write_tensor(&file, layer.attention_norm);
-        write_tensor(&file, layer.wq);
-        write_tensor(&file, layer.wk);
-        write_tensor(&file, layer.wv);
-        write_tensor(&file, layer.wo);
-        write_tensor(&file, layer.ffn_norm);
-        write_tensor(&file, layer.w1);
-        write_tensor(&file, layer.w2);
-        write_tensor(&file, layer.w3);
-    }
+#pragma message("TODO: implement file saving using gguf")
+    (void) vocab;
+    (void) model;
+    (void) w;
+    // // write_magic
+    // file.write_u32(LLAMA_FILE_MAGIC);   // magic
+    // file.write_u32(LLAMA_FILE_VERSION); // version
+    // // write_hparams
+    // file.write_u32(model->hparams.n_vocab);
+    // file.write_u32(model->hparams.n_embd);
+    // file.write_u32(model->hparams.n_mult);
+    // file.write_u32(model->hparams.n_head);
+    // file.write_u32(model->hparams.n_layer);
+    // file.write_u32(model->hparams.n_rot);
+    // file.write_u32(LLAMA_FTYPE_ALL_F32);
+    //
+    // // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
+    // uint32_t n_vocab = model->hparams.n_vocab;
+    // for (uint32_t i = 0; i < n_vocab; i++) {
+    //     const auto & token_score = vocab->id_to_token.at(i);
+    //     file.write_u32((uint32_t) token_score.tok.size());
+    //     file.write_raw(token_score.tok.data(), token_score.tok.size());
+    //     file.write_raw(&token_score.score, sizeof(token_score.score));
+    // }
+    //
+    // // stuff AK weights into GG weights one by one.
+    // // w->token_embedding_table -> model->tok_embeddings
+    // // float*                   -> struct ggml_tensor
+    // stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
+    // stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
+    //
+    // stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
+    // //print_row(model->norm, 0);
+    //
+    // // for rms-att-weight
+    // int row_length = model->hparams.n_embd;
+    // const auto & hparams = model->hparams;
+    // //int n_ff = model->hparams.n_embd;
+    // int n_ff = get_n_ff(&hparams);
+    //
+    // for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
+    //     auto & layer = model->layers[i];
+    //     // 1d
+    //     stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
+    //     stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
+    //
+    //     // from 3d matrix layer x dim x dim to 2d matrix dim x dim
+    //     stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length*row_length]);
+    //     stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length*row_length]);
+    //     stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
+    //     stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);
+    //
+    //     stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
+    //     stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
+    //     stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
+    // }
+    // // write tensors
+    // write_tensor(&file, model->tok_embeddings);
+    // write_tensor(&file, model->norm);
+    // write_tensor(&file, model->output); // ?
+    // for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
+    //     auto & layer = model->layers[i];
+    //
+    //     write_tensor(&file, layer.attention_norm);
+    //     write_tensor(&file, layer.wq);
+    //     write_tensor(&file, layer.wk);
+    //     write_tensor(&file, layer.wv);
+    //     write_tensor(&file, layer.wo);
+    //     write_tensor(&file, layer.ffn_norm);
+    //     write_tensor(&file, layer.w1);
+    //     write_tensor(&file, layer.w2);
+    //     write_tensor(&file, layer.w3);
+    // }
 }

 struct train_params get_default_train_params() {

View file

@@ -1,126 +0,0 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "common.h"
#include "gguf-llama.h"
#include "build-info.h"
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
int main(int argc, char ** argv) {
gpt_params params;
if (argc == 1 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
return 1 ;
}
if (argc >= 2) {
params.model = argv[1];
}
if (argc >= 3) {
params.prompt = argv[2];
}
if (params.prompt.empty()) {
params.prompt = "Hello my name is";
}
// init LLM
llama_backend_init(params.numa);
llama_context_params ctx_params = llama_context_default_params();
llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);
if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
return 1;
}
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
// tokenize the prompt
std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
const int max_context_size = llama_n_ctx(ctx);
const int max_tokens_list_size = max_context_size - 4;
if ((int) tokens_list.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
return 1;
}
fprintf(stderr, "\n\n");
for (auto id : tokens_list) {
fprintf(stderr, "%s", llama_token_to_str(ctx, id).c_str());
}
fflush(stderr);
// main loop
// The LLM keeps a contextual cache memory of previous token evaluation.
// Usually, once this cache is full, it is required to recompute a compressed context based on previous
// tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
// example, we will just stop the loop once this cache is full or once an end of stream is detected.
while (llama_get_kv_cache_token_count(ctx) < max_context_size) {
// evaluate the transformer
if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
tokens_list.clear();
// sample the next token
llama_token new_token_id = 0;
auto logits = llama_get_logits(ctx);
auto n_vocab = llama_n_vocab(ctx);
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
new_token_id = llama_sample_token_greedy(ctx , &candidates_p);
// is it an end of stream ?
if (new_token_id == llama_token_eos()) {
fprintf(stderr, " [end of text]\n");
break;
}
// print the new token :
printf("%s", llama_token_to_str(ctx, new_token_id).c_str());
fflush(stdout);
// push this new token for next evaluation
tokens_list.push_back(new_token_id);
}
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
return 0;
}

View file

@@ -1,6 +1,5 @@
 #include "ggml.h"
-#include "gguf-util.h"
-#include "gguf-llama.h"
+#include "llama.h"

 #include <cstdio>
 #include <cinttypes>
@@ -21,133 +20,22 @@ static std::string to_string(const T & val) {
     return ss.str();
 }

-void gguf_ex_write_str(std::ofstream & fout, const std::string & val) {
-    const int32_t n = val.size();
-    fout.write((const char *) &n, sizeof(n));
-    fout.write(val.c_str(), n);
-}
-
-void gguf_ex_write_i32(std::ofstream & fout, int32_t val) {
-    fout.write((const char *) &val, sizeof(val));
-}
-
-void gguf_ex_write_u64(std::ofstream & fout, size_t val) {
-    fout.write((const char *) &val, sizeof(val));
-}
-
-template<typename T>
-void gguf_ex_write_val(std::ofstream & fout, const std::string & key, enum gguf_type type, const T & val) {
-    gguf_ex_write_str(fout, key);
-    fout.write((const char *) &type, sizeof(type));
-    fout.write((const char *) &val, sizeof(val));
-
-    fprintf(stdout, "%s: write param: %s = %s\n", __func__, key.c_str(), to_string(val).c_str());
-}
-
-template<>
-void gguf_ex_write_val<std::string>(std::ofstream & fout, const std::string & key, enum gguf_type type, const std::string & val) {
-    gguf_ex_write_str(fout, key);
-    fout.write((const char *) &type, sizeof(type));
-
-    const int32_t n = val.size();
-    fout.write((const char *) &n, sizeof(n));
-    fout.write(val.c_str(), n);
-
-    fprintf(stdout, "%s: write param: %s = %s\n", __func__, key.c_str(), val.c_str());
-}
-
-template<typename T>
-void gguf_ex_write_arr(std::ofstream & fout, const std::string & key, enum gguf_type type, const std::vector<T> & val) {
-    gguf_ex_write_str(fout, key);
-    {
-        const enum gguf_type tarr = GGUF_TYPE_ARRAY;
-        fout.write((const char *) &tarr, sizeof(tarr));
-    }
-
-    const int32_t n = val.size();
-    fout.write((const char *) &type, sizeof(type));
-    fout.write((const char *) &n, sizeof(n));
-    fout.write((const char *) val.data(), n * sizeof(T));
-
-    fprintf(stdout, "%s: write param: %s = [", __func__, key.c_str());
-    for (int i = 0; i < n; ++i) {
-        fprintf(stdout, "%s", to_string(val[i]).c_str());
-        if (i < n - 1) {
-            fprintf(stdout, ", ");
-        }
-    }
-    fprintf(stdout, "]\n");
-}
-
-template<>
-void gguf_ex_write_arr<std::string>(std::ofstream & fout, const std::string & key, enum gguf_type type, const std::vector<std::string> & val) {
-    gguf_ex_write_str(fout, key);
-    {
-        const enum gguf_type tarr = GGUF_TYPE_ARRAY;
-        fout.write((const char *) &tarr, sizeof(tarr));
-    }
-
-    const int32_t n = val.size();
-    fout.write((const char *) &type, sizeof(type));
-    fout.write((const char *) &n, sizeof(n));
-    for (int i = 0; i < n; ++i) {
-        const int32_t nstr = val[i].size();
-        fout.write((const char *) &nstr, sizeof(nstr));
-        fout.write(val[i].c_str(), nstr);
-    }
-
-    fprintf(stdout, "%s: write param: %s = [", __func__, key.c_str());
-    for (int i = 0; i < n; ++i) {
-        fprintf(stdout, "%s", val[i].c_str());
-        if (i < n - 1) {
-            fprintf(stdout, ", ");
-        }
-    }
-    fprintf(stdout, "]\n");
-}
-
 bool gguf_ex_write(const std::string & fname) {
-    std::ofstream fout(fname.c_str(), std::ios::binary);
-
-    {
-        const int32_t magic = GGUF_MAGIC;
-        fout.write((const char *) &magic, sizeof(magic));
-    }
-
-    {
-        const int32_t version = GGUF_VERSION;
-        fout.write((const char *) &version, sizeof(version));
-    }
-
-    // NOTE: these have to match the output below!
-    const int n_tensors = 10;
-    const int n_kv      = 12;
-
-    fout.write((const char*) &n_tensors, sizeof(n_tensors));
-    fout.write((const char*) &n_kv,      sizeof(n_kv));
-
-    fprintf(stdout, "%s: write header\n", __func__);
-
-    // kv data
-    {
-        gguf_ex_write_val< uint8_t>(fout, "some.parameter.uint8",   GGUF_TYPE_UINT8,   0x12);
-        gguf_ex_write_val<  int8_t>(fout, "some.parameter.int8",    GGUF_TYPE_INT8,   -0x13);
-        gguf_ex_write_val<uint16_t>(fout, "some.parameter.uint16",  GGUF_TYPE_UINT16,  0x1234);
-        gguf_ex_write_val< int16_t>(fout, "some.parameter.int16",   GGUF_TYPE_INT16,  -0x1235);
-        gguf_ex_write_val<uint32_t>(fout, "some.parameter.uint32",  GGUF_TYPE_UINT32,  0x12345678);
-        gguf_ex_write_val< int32_t>(fout, "some.parameter.int32",   GGUF_TYPE_INT32,  -0x12345679);
-        gguf_ex_write_val<float>   (fout, "some.parameter.float32", GGUF_TYPE_FLOAT32, 0.123456789f);
-        gguf_ex_write_val<bool>    (fout, "some.parameter.bool",    GGUF_TYPE_BOOL,    true);
-        gguf_ex_write_val<std::string>(fout, "some.parameter.string", GGUF_TYPE_STRING, "hello world");
-
-        gguf_ex_write_arr<int16_t>    (fout, "some.parameter.arr.i16", GGUF_TYPE_INT16,   { 1, 2, 3, 4, });
-        gguf_ex_write_arr<float>      (fout, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, { 3.145f, 2.718f, 1.414f, });
-        gguf_ex_write_arr<std::string>(fout, "some.parameter.arr.str", GGUF_TYPE_STRING,  { "hello", "world", "!" });
-    }
-
-    uint64_t offset_tensor = 0;
+    struct gguf_context * ctx = gguf_init_empty();
+
+    gguf_set_val_u8  (ctx, "some.parameter.uint8",    0x12);
+    gguf_set_val_i8  (ctx, "some.parameter.int8",    -0x13);
+    gguf_set_val_u16 (ctx, "some.parameter.uint16",   0x1234);
+    gguf_set_val_i16 (ctx, "some.parameter.int16",   -0x1235);
+    gguf_set_val_u32 (ctx, "some.parameter.uint32",   0x12345678);
+    gguf_set_val_i32 (ctx, "some.parameter.int32",   -0x12345679);
+    gguf_set_val_f32 (ctx, "some.parameter.float32",  0.123456789f);
+    gguf_set_val_bool(ctx, "some.parameter.bool",     true);
+    gguf_set_val_str (ctx, "some.parameter.string",   "hello world");
+
+    gguf_set_arr_data(ctx, "some.parameter.arr.i16", GGUF_TYPE_INT16,   std::vector<int16_t>{ 1, 2, 3, 4, }.data(), 4);
+    gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector<float>{ 3.145f, 2.718f, 1.414f, }.data(), 3);
+    gguf_set_arr_str (ctx, "some.parameter.arr.str", std::vector<const char *>{ "hello", "world", "!" }.data(), 3);

     struct ggml_init_params params = {
         /*.mem_size   =*/ 128ull*1024ull*1024ull,
@@ -157,6 +45,8 @@ bool gguf_ex_write(const std::string & fname) {

     struct ggml_context * ctx_data = ggml_init(params);

+    const int n_tensors = 10;
+
     // tensor infos
     for (int i = 0; i < n_tensors; ++i) {
         const std::string name = "tensor_" + to_string(i);
@@ -178,58 +68,15 @@ bool gguf_ex_write(const std::string & fname) {
             }
         }

-        fprintf(stdout, "%s: tensor: %s, %d dims, ne = [", __func__, name.c_str(), n_dims);
-        for (int j = 0; j < 4; ++j) {
-            fprintf(stdout, "%s%3d", j == 0 ? "" : ", ", (int) cur->ne[j]);
-        }
-        fprintf(stdout, "], offset_tensor = %6" PRIu64 "\n", offset_tensor);
-
-        gguf_ex_write_str(fout, name);
-        gguf_ex_write_i32(fout, n_dims);
-        for (int j = 0; j < n_dims; ++j) {
-            gguf_ex_write_i32(fout, cur->ne[j]);
-        }
-        gguf_ex_write_i32(fout, cur->type);
-        gguf_ex_write_u64(fout, offset_tensor);
-
-        offset_tensor += GGML_PAD(ggml_nbytes(cur), GGUF_DEFAULT_ALIGNMENT);
+        gguf_add_tensor(ctx, cur);
     }

-    const uint64_t offset_data = GGML_PAD((uint64_t) fout.tellp(), GGUF_DEFAULT_ALIGNMENT);
-
-    fprintf(stdout, "%s: data offset = %" PRIu64 "\n", __func__, offset_data);
-
-    {
-        const size_t pad = offset_data - fout.tellp();
-
-        for (size_t j = 0; j < pad; ++j) {
-            fout.put(0);
-        }
-    }
-
-    for (int i = 0; i < n_tensors; ++i) {
-        fprintf(stdout, "%s: writing tensor %d data\n", __func__, i);
-
-        const std::string name = "tensor_" + to_string(i);
-
-        struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str());
-
-        fout.write((const char *) cur->data, ggml_nbytes(cur));
-
-        {
-            const size_t pad = GGML_PAD(ggml_nbytes(cur), GGUF_DEFAULT_ALIGNMENT) - ggml_nbytes(cur);
-
-            for (size_t j = 0; j < pad; ++j) {
-                fout.put(0);
-            }
-        }
-    }
-
-    fout.close();
+    gguf_write_to_file(ctx, fname.c_str(), false);

     fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str());

     ggml_free(ctx_data);
+    gguf_free(ctx);

     return true;
 }
@@ -345,8 +192,16 @@ bool gguf_ex_read_1(const std::string & fname) {
         struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);

-        fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n",
-                __func__, i, cur->n_dims, cur->name, cur->data);
+        fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
+
+        // print first 10 elements
+        const float * data = (const float *) cur->data;
+
+        printf("%s data[:10] : ", name);
+        for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
+            printf("%f ", data[j]);
+        }
+        printf("\n\n");

         // check data
         {
@@ -369,48 +224,6 @@ bool gguf_ex_read_1(const std::string & fname) {
     return true;
 }

-// read just the tensor info and mmap the data in user code
-bool gguf_ex_read_2(const std::string & fname) {
-    struct ggml_context * ctx_data = NULL;
-
-    struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx      = */ &ctx_data,
-    };
-
-    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
-
-    struct gguf_file file(fname.c_str(), "rb");
-    gguf_mmap data_mmap(&file, 0, false);
-
-    const int n_tensors = gguf_get_n_tensors(ctx);
-
-    for (int i = 0; i < n_tensors; ++i) {
-        const char * name   = gguf_get_tensor_name(ctx, i);
-        const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
-
-        struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
-
-        cur->data = static_cast<char *>(data_mmap.addr) + offset;
-
-        // print first 10 elements
-        const float * data = (const float *) cur->data;
-
-        printf("%s data[:10] : ", name);
-        for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
-            printf("%f ", data[j]);
-        }
-        printf("\n\n");
-    }
-
-    fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
-
-    ggml_free(ctx_data);
-    gguf_free(ctx);
-
-    return true;
-}
-
 int main(int argc, char ** argv) {
     if (argc < 3) {
         fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]);
@@ -427,7 +240,6 @@ int main(int argc, char ** argv) {
     } else if (mode == "r") {
         GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
         GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
-        GGML_ASSERT(gguf_ex_read_2(fname) && "failed to read gguf file");
     } else if (mode == "q") {
         llama_model_quantize_params params = llama_model_quantize_default_params();
         llama_model_quantize(fname.c_str(), "quant.gguf", &params);

View file

@@ -266,9 +266,6 @@ int main(int argc, char ** argv) {
         params.interactive = true;
     }

-    // determine newline token
-    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
     if (params.verbose_prompt) {
         fprintf(stderr, "\n");
         fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
@@ -778,8 +775,7 @@ int main(int argc, char ** argv) {
             if (grammar != NULL) {
                 llama_grammar_free(grammar);

-                std::vector<const llama_grammar_element *> grammar_rules(
-                    parsed_grammar.c_rules());
+                std::vector<const llama_grammar_element *> grammar_rules( parsed_grammar.c_rules());
                 grammar = llama_grammar_init(
                     grammar_rules.data(), grammar_rules.size(),
                     parsed_grammar.symbol_ids.at("root"));

View file

@@ -68,10 +68,10 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
 }

 // usage:
-//  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
+//  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
 void usage(const char * executable) {
-    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
+    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
     fprintf(stderr, "  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     fprintf(stderr, "  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
     fprintf(stderr, "\nAllowed quantization types:\n");
@@ -118,8 +118,8 @@ int main(int argc, char ** argv) {
         if (pos != std::string::npos) {
             fpath = fname_inp.substr(0, pos + 1);
         }
-        // export as [inp path]/ggml-model-[ftype].bin
-        fname_out = fpath + "ggml-model-" + ftype_str + ".bin";
+        // export as [inp path]/ggml-model-[ftype].gguf
+        fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
         arg_idx++;
     }
     else {

View file

@@ -26,7 +26,6 @@ int main(int argc, char ** argv) {
     auto lparams = llama_context_default_params();

     lparams.n_ctx     = params.n_ctx;
-    lparams.n_gqa     = params.n_gqa;
     lparams.seed      = params.seed;
     lparams.f16_kv    = params.memory_f16;
     lparams.use_mmap  = params.use_mmap;

View file

@@ -651,8 +651,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     fprintf(stdout, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
     fprintf(stdout, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stdout, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
-    fprintf(stdout, "  -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
     fprintf(stdout, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
     fprintf(stdout, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
     fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
@@ -773,23 +771,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
         }
         params.n_ctx = std::stoi(argv[i]);
     }
-    else if (arg == "-gqa" || arg == "--gqa")
-    {
-        if (++i >= argc)
-        {
-            invalid_param = true;
-            break;
-        }
-        params.n_gqa = std::stoi(argv[i]);
-    }
-    else if (arg == "-eps" || arg == "--rms-norm-eps") {
-        if (++i >= argc)
-        {
-            invalid_param = true;
-            break;
-        }
-        params.rms_norm_eps = std::stof(argv[i]);
-    }
     else if (arg == "--rope-freq-base")
     {
         if (++i >= argc)

View file

@@ -36,16 +36,17 @@ int main(int argc, char ** argv) {

     llama_backend_init(params.numa);

-    llama_model * model;
-    llama_context * ctx;
+    llama_context_params ctx_params = llama_context_default_params();

-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);

     if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
         return 1;
     }

+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+
     // tokenize the prompt

     std::vector<llama_token> tokens_list;
@@ -54,7 +55,7 @@ int main(int argc, char ** argv) {
     const int max_context_size     = llama_n_ctx(ctx);
     const int max_tokens_list_size = max_context_size - 4;

-    if ((int)tokens_list.size() > max_tokens_list_size) {
+    if ((int) tokens_list.size() > max_tokens_list_size) {
         fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
         return 1;
     }
@@ -74,7 +75,9 @@ int main(int argc, char ** argv) {
     // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
     // example, we will just stop the loop once this cache is full or once an end of stream is detected.

-    while (llama_get_kv_cache_token_count( ctx ) < max_context_size) {
+    const int n_gen = std::min(32, max_context_size);
+
+    while (llama_get_kv_cache_token_count(ctx) < n_gen) {
         // evaluate the transformer

         if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
@@ -114,7 +117,6 @@ int main(int argc, char ** argv) {
         // push this new token for next evaluation
         tokens_list.push_back(new_token_id);
     }

     llama_free(ctx);
@@ -122,5 +124,7 @@ int main(int argc, char ** argv) {

     llama_backend_free();

+    fprintf(stderr, "\n\n");
+
     return 0;
 }

View file

@@ -17,7 +17,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
+static const float rms_norm_eps = 1e-5f;

 struct random_normal_distribution {
     std::mt19937 gen;
@@ -2612,42 +2612,45 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
         return;
     }

-    // write_magic
-    file.write_u32(LLAMA_FILE_MAGIC);   // magic
-    file.write_u32(LLAMA_FILE_VERSION); // version
-    // write_hparams
-    file.write_u32(model->hparams.n_vocab);
-    file.write_u32(model->hparams.n_embd);
-    file.write_u32(model->hparams.n_mult);
-    file.write_u32(model->hparams.n_head);
-    file.write_u32(model->hparams.n_layer);
-    file.write_u32(model->hparams.n_rot);
-    file.write_u32(LLAMA_FTYPE_ALL_F32);
-    // write_vocab
-    uint32_t n_vocab = model->hparams.n_vocab;
-    for (uint32_t i = 0; i < n_vocab; i++) {
-        const auto & token_score = vocab->id_to_token.at(i);
-        file.write_u32((uint32_t) token_score.tok.size());
-        file.write_raw(token_score.tok.data(), token_score.tok.size());
-        file.write_raw(&token_score.score, sizeof(token_score.score));
-    }
-    // write tensors
-    write_tensor(&file, model->tok_embeddings);
-    write_tensor(&file, model->norm);
-    write_tensor(&file, model->output);
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        write_tensor(&file, layer.attention_norm);
-        write_tensor(&file, layer.wq);
-        write_tensor(&file, layer.wk);
-        write_tensor(&file, layer.wv);
-        write_tensor(&file, layer.wo);
-        write_tensor(&file, layer.ffn_norm);
-        write_tensor(&file, layer.w1);
-        write_tensor(&file, layer.w2);
-        write_tensor(&file, layer.w3);
-    }
+#pragma message("TODO: implement file saving using gguf")
+    (void) vocab;
+    (void) model;
+    // // write_magic
+    // file.write_u32(LLAMA_FILE_MAGIC);   // magic
+    // file.write_u32(LLAMA_FILE_VERSION); // version
+    // // write_hparams
+    // file.write_u32(model->hparams.n_vocab);
+    // file.write_u32(model->hparams.n_embd);
+    // file.write_u32(model->hparams.n_mult);
+    // file.write_u32(model->hparams.n_head);
+    // file.write_u32(model->hparams.n_layer);
+    // file.write_u32(model->hparams.n_rot);
+    // file.write_u32(LLAMA_FTYPE_ALL_F32);
+    // // write_vocab
+    // uint32_t n_vocab = model->hparams.n_vocab;
+    // for (uint32_t i = 0; i < n_vocab; i++) {
+    //     const auto & token_score = vocab->id_to_token.at(i);
+    //     file.write_u32((uint32_t) token_score.tok.size());
+    //     file.write_raw(token_score.tok.data(), token_score.tok.size());
+    //     file.write_raw(&token_score.score, sizeof(token_score.score));
+    // }
+    // // write tensors
+    // write_tensor(&file, model->tok_embeddings);
+    // write_tensor(&file, model->norm);
+    // write_tensor(&file, model->output);
+    // for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
+    //     auto & layer = model->layers[i];
+    //
+    //     write_tensor(&file, layer.attention_norm);
+    //     write_tensor(&file, layer.wq);
+    //     write_tensor(&file, layer.wk);
+    //     write_tensor(&file, layer.wv);
+    //     write_tensor(&file, layer.wo);
+    //     write_tensor(&file, layer.ffn_norm);
+    //     write_tensor(&file, layer.w1);
+    //     write_tensor(&file, layer.w2);
+    //     write_tensor(&file, layer.w3);
+    // }
 }

 float cosine_decay(const int decay_steps, const float alpha, int step) {

604
ggml.c
View file

@@ -213,10 +213,10 @@ inline static void * ggml_aligned_malloc(size_t size) {
                 error_desc = "insufficient memory";
                 break;
         }
-        GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
-                __func__, error_desc, size/(1024.0*1024.0));
+        GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
         return NULL;
     }
     return aligned_memory;
 }
 #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
@ -4109,7 +4109,11 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
    //
    // is enough, but just in case, adding the second part
    return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
}

size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
    return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
}
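Editor's note (not part of the diff): ggml_nbytes() itself no longer rounds up; the GGML_PAD(..., GGML_MEM_ALIGN) rounding moves into the new ggml_nbytes_pad(). Assuming the usual 16-byte GGML_MEM_ALIGN, a 100-byte tensor now reports 100 bytes raw and 112 bytes padded. ggml_graph_export() below switches to the padded variant when sizing its evaluation buffer.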
size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
@ -9136,6 +9140,8 @@ static void ggml_compute_forward_mul(
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
        struct ggml_tensor * dst) {
    GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");

    switch (src0->type) {
        case GGML_TYPE_F32:
            {
@ -16899,7 +16905,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
    // compute size of intermediate results
    // TODO: does not take into account scratch buffers !!!!
    for (int i = 0; i < cgraph->n_nodes; ++i) {
        size_eval += ggml_nbytes_pad(cgraph->nodes[i]);
    }

    // print
@ -18579,6 +18585,20 @@ static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
};
static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
[GGUF_TYPE_UINT8] = "u8",
[GGUF_TYPE_INT8] = "i8",
[GGUF_TYPE_UINT16] = "u16",
[GGUF_TYPE_INT16] = "i16",
[GGUF_TYPE_UINT32] = "u32",
[GGUF_TYPE_INT32] = "i32",
[GGUF_TYPE_FLOAT32] = "f32",
[GGUF_TYPE_BOOL] = "bool",
[GGUF_TYPE_STRING] = "str",
[GGUF_TYPE_ARRAY] = "arr",
};
static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
union gguf_value {
    uint8_t  uint8;
    int8_t   int8;
@ -18613,8 +18633,6 @@ struct gguf_header {
    uint32_t version;
    uint32_t n_tensors;
    uint32_t n_kv;
};

(the gguf_kv array is no longer stored inside gguf_header; it moves into gguf_context, below)

struct gguf_tensor_info {
@ -18622,44 +18640,69 @@ struct gguf_tensor_info {
(gguf_tensor_info drops the unused n_elms field and gains data/size members for the new writing API; gguf_context takes ownership of the kv array, renames size_data to size, and holds data as a void pointer)

    uint32_t n_dims;
    uint32_t ne[GGML_MAX_DIMS];

    enum ggml_type type;

    uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`

    // for writing API
    const void * data;
    size_t size;
};

struct gguf_context {
    struct gguf_header header;

    struct gguf_kv          * kv;
    struct gguf_tensor_info * infos;

    size_t alignment;
    size_t offset;    // offset of `data` from beginning of file
    size_t size;      // size of `data` in bytes

    //uint8_t * padding;
    void * data;
};
(the gguf_fread_* helpers now take the FILE * as their first argument; all call sites below change accordingly)

static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
    const size_t n = fread(dst, 1, size, file);
    *offset += n;
    return n == size;
}

static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
    p->n = 0;
    p->data = NULL;

    bool ok = true;

    // TODO: how to avoid mallocs for strings?
    ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
    ok = ok && gguf_fread_el(file,  p->data, p->n,      offset);

    return ok;
}
struct gguf_context * gguf_init_empty(void) {
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
ctx->header.magic = GGUF_MAGIC;
ctx->header.version = GGUF_VERSION;
ctx->header.n_tensors = 0;
ctx->header.n_kv = 0;
ctx->kv = NULL;
ctx->infos = NULL;
ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
ctx->offset = 0;
ctx->size = 0;
ctx->data = NULL;
return ctx;
}
struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
    FILE * file = fopen(fname, "rb");
    if (!file) {
@ -18673,7 +18716,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
    // check the magic before making allocations
    {
        gguf_fread_el(file, &magic, sizeof(magic), &offset);

        if (magic != GGUF_MAGIC) {
            fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
@ -18689,14 +18732,14 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
    // read the header
    {
        ctx->header.magic = magic;

        ctx->kv    = NULL;
        ctx->infos = NULL;
        ctx->data  = NULL;

        ok = ok && gguf_fread_el(file, &ctx->header.version,   sizeof(ctx->header.version),   &offset);
        ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
        ok = ok && gguf_fread_el(file, &ctx->header.n_kv,      sizeof(ctx->header.n_kv),      &offset);

        if (!ok) {
            fprintf(stderr, "%s: failed to read header\n", __func__);
@ -18708,33 +18751,33 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
    // read the kv pairs
    {
        ctx->kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));

        for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
            struct gguf_kv * kv = &ctx->kv[i];

            //fprintf(stderr, "%s: reading kv %d\n", __func__, i);

            ok = ok && gguf_fread_str(file, &kv->key, &offset);
            //ok = ok && gguf_fread_el (file, &kv->n_bytes, sizeof(kv->n_bytes), &offset);
            ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);

            //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);

            switch (kv->type) {
                case GGUF_TYPE_UINT8:   ok = ok && gguf_fread_el (file, &kv->value.uint8,   sizeof(kv->value.uint8),   &offset); break;
                case GGUF_TYPE_INT8:    ok = ok && gguf_fread_el (file, &kv->value.int8,    sizeof(kv->value.int8),    &offset); break;
                case GGUF_TYPE_UINT16:  ok = ok && gguf_fread_el (file, &kv->value.uint16,  sizeof(kv->value.uint16),  &offset); break;
                case GGUF_TYPE_INT16:   ok = ok && gguf_fread_el (file, &kv->value.int16,   sizeof(kv->value.int16),   &offset); break;
                case GGUF_TYPE_UINT32:  ok = ok && gguf_fread_el (file, &kv->value.uint32,  sizeof(kv->value.uint32),  &offset); break;
                case GGUF_TYPE_INT32:   ok = ok && gguf_fread_el (file, &kv->value.int32,   sizeof(kv->value.int32),   &offset); break;
                case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
                case GGUF_TYPE_BOOL:    ok = ok && gguf_fread_el (file, &kv->value.bool_,   sizeof(kv->value.bool_),   &offset); break;
                case GGUF_TYPE_STRING:  ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break;
                case GGUF_TYPE_ARRAY:
                    {
                        ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
                        ok = ok && gguf_fread_el(file, &kv->value.arr.n,    sizeof(kv->value.arr.n),    &offset);

                        switch (kv->value.arr.type) {
                            case GGUF_TYPE_UINT8:
@ -18747,17 +18790,17 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                            case GGUF_TYPE_BOOL:
                                {
                                    kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
                                    ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset);
                                } break;
                            case GGUF_TYPE_STRING:
                                {
                                    kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
                                    for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
                                        ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
                                    }
                                } break;
                            case GGUF_TYPE_ARRAY:
                            case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
                        };
                    } break;
                case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
@ -18787,14 +18830,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
            info->ne[j] = 1;
        }

        ok = ok && gguf_fread_str(file, &info->name,                          &offset);
        ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims),  &offset);
        for (uint32_t j = 0; j < info->n_dims; ++j) {
            ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
        }
        ok = ok && gguf_fread_el (file, &info->type,   sizeof(info->type),    &offset);
        ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset),  &offset);

        if (!ok) {
            fprintf(stderr, "%s: failed to read tensor info\n", __func__);
@ -18827,8 +18869,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
    // compute the total size of the data section, taking into account the alignment
    {
        ctx->size = 0;
        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
            struct gguf_tensor_info * info = &ctx->infos[i];
@ -18848,7 +18889,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
            const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);

            ctx->size += GGML_PAD(size_cur, ctx->alignment);
        }
    }
@ -18862,7 +18903,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
        const size_t mem_size =
            params.no_alloc ?
            (ctx->header.n_tensors    )*ggml_tensor_overhead() :
            (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;

        struct ggml_init_params pdata = {
            .mem_size   = mem_size,
@ -18877,12 +18918,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
        struct ggml_tensor * data = NULL;

        if (params.no_alloc == false) {
            data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);

            ok = ok && data != NULL;

            // read the binary blob with the tensor data
            ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset);

            if (!ok) {
                fprintf(stderr, "%s: failed to read tensor data\n", __func__);
@ -18944,10 +18985,10 @@ void gguf_free(struct gguf_context * ctx) {
        return;
    }

    if (ctx->kv) {
        // free string memory - not great..
        for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
            struct gguf_kv * kv = &ctx->kv[i];

            if (kv->key.data) {
                free(kv->key.data);
@ -18974,7 +19015,7 @@ void gguf_free(struct gguf_context * ctx) {
            }
        }

        GGML_ALIGNED_FREE(ctx->kv);
    }

    if (ctx->infos) {
@ -18992,6 +19033,10 @@ void gguf_free(struct gguf_context * ctx) {
    GGML_ALIGNED_FREE(ctx);
}
const char * gguf_type_name(enum gguf_type type) {
return GGUF_TYPE_NAME[type];
}
int gguf_get_version(struct gguf_context * ctx) {
    return ctx->header.version;
}
@ -19014,9 +19059,10 @@ int gguf_get_n_kv(struct gguf_context * ctx) {
int gguf_find_key(struct gguf_context * ctx, const char * key) {
    // return -1 if key not found
    int keyfound = -1;

    const int n_kv = gguf_get_n_kv(ctx);

    for (int i = 0; i < n_kv; ++i) {
        if (strcmp(key, gguf_get_key(ctx, i)) == 0) {
            keyfound = i;
@ -19028,71 +19074,87 @@ int gguf_find_key(struct gguf_context * ctx, const char * key) {
}
(every getter below now indexes ctx->kv rather than ctx->header.kv; the old gguf_get_arr_f32() helper is dropped in favour of the typed gguf_get_arr_data())

const char * gguf_get_key(struct gguf_context * ctx, int i) {
    return ctx->kv[i].key.data;
}

enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) {
    return ctx->kv[i].type;
}

enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.arr.type;
}

const void * gguf_get_arr_data(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.arr.data;
}

const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) {
    struct gguf_kv * kv = &ctx->kv[key_id];
    struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
    return str->data;
}

int gguf_get_arr_n(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.arr.n;
}

uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.uint8;
}

int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.int8;
}

uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.uint16;
}

int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.int16;
}

uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.uint32;
}

int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.int32;
}

float gguf_get_val_f32(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.float32;
}

bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.bool_;
}

const char * gguf_get_val_str(struct gguf_context * ctx, int i) {
    return ctx->kv[i].value.str.data;
}

int gguf_get_n_tensors(struct gguf_context * ctx) {
    return ctx->header.n_tensors;
}
int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
// return -1 if tensor not found
int tensorfound = -1;
const int n_tensors = gguf_get_n_tensors(ctx);
for (int i = 0; i < n_tensors; ++i) {
if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
tensorfound = i;
break;
}
}
return tensorfound;
}
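Editor's illustration (not part of the diff) of how the read-side getters above compose; the tensor name queried is a placeholder and <stdio.h> is assumed:

    static void dump_gguf_sketch(const char * fname) {
        struct ggml_context * ctx_data = NULL;

        struct gguf_init_params params = {
            /*.no_alloc =*/ false,
            /*.ctx      =*/ &ctx_data,
        };

        struct gguf_context * gctx = gguf_init_from_file(fname, params);
        if (!gctx) {
            return;
        }

        printf("version: %d, alignment: %zu, data offset: %zu\n",
                gguf_get_version(gctx), gguf_get_alignment(gctx), gguf_get_data_offset(gctx));

        for (int i = 0; i < gguf_get_n_kv(gctx); ++i) {
            printf("kv[%d]: %s (%s)\n", i, gguf_get_key(gctx, i), gguf_type_name(gguf_get_kv_type(gctx, i)));
        }

        const int idx = gguf_find_tensor(gctx, "tok_embeddings.weight"); // placeholder name
        if (idx >= 0) {
            printf("%s @ data offset %zu\n", gguf_get_tensor_name(gctx, idx), gguf_get_tensor_offset(gctx, idx));
        }

        gguf_free(gctx);
        ggml_free(ctx_data);
    }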
size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
    return ctx->infos[i].offset;
}
@ -19101,6 +19163,406 @@ char * gguf_get_tensor_name(struct gguf_context * ctx, int i) {
    return ctx->infos[i].name.data;
}
// returns the index
static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
const int idx = gguf_find_key(ctx, key);
if (idx >= 0) {
return idx;
}
const int n_kv = gguf_get_n_kv(ctx);
ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
ctx->kv[n_kv].key.n = strlen(key) + 1;
ctx->kv[n_kv].key.data = strdup(key);
ctx->header.n_kv++;
return n_kv;
}
void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_UINT8;
ctx->kv[idx].value.uint8 = val;
}
void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_INT8;
ctx->kv[idx].value.int8 = val;
}
void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_UINT16;
ctx->kv[idx].value.uint16 = val;
}
void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_INT16;
ctx->kv[idx].value.int16 = val;
}
void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_UINT32;
ctx->kv[idx].value.uint32 = val;
}
void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_INT32;
ctx->kv[idx].value.int32 = val;
}
void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_FLOAT32;
ctx->kv[idx].value.float32 = val;
}
void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_BOOL;
ctx->kv[idx].value.bool_ = val;
}
void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_STRING;
ctx->kv[idx].value.str.n = strlen(val) + 1;
ctx->kv[idx].value.str.data = strdup(val);
}
void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
ctx->kv[idx].value.arr.type = type;
ctx->kv[idx].value.arr.n = n;
ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]);
memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]);
}
void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
const int idx = gguf_get_or_add_key(ctx, key);
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
ctx->kv[idx].value.arr.n = n;
ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
for (int i = 0; i < n; i++) {
struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
str->n = strlen(data[i]) + 1;
str->data = strdup(data[i]);
}
}
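Editor's illustration (not part of the diff): given a gguf_context * gctx, array-valued keys combine gguf_set_arr_str() for strings with gguf_set_arr_data() for plain element types. The tokenizer key names and values here are placeholders:

    const char * tokens[] = { "<unk>", "<s>", "</s>" };
    const float  scores[] = { 0.0f, -1.0f, -2.0f };

    gguf_set_arr_str (gctx, "tokenizer.tokens", tokens, 3);
    gguf_set_arr_data(gctx, "tokenizer.scores", GGUF_TYPE_FLOAT32, scores, 3);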
// set or add KV pairs from another context
void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
for (uint32_t i = 0; i < src->header.n_kv; i++) {
switch (src->kv[i].type) {
case GGUF_TYPE_UINT8: gguf_set_val_u8 (ctx, src->kv[i].key.data, src->kv[i].value.uint8); break;
case GGUF_TYPE_INT8: gguf_set_val_i8 (ctx, src->kv[i].key.data, src->kv[i].value.int8); break;
case GGUF_TYPE_UINT16: gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16); break;
case GGUF_TYPE_INT16: gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16); break;
case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break;
case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break;
case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break;
case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break;
case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
case GGUF_TYPE_ARRAY:
{
if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
}
gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
free(data);
} if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
GGML_ASSERT(false && "nested arrays not supported");
} else {
gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
}
} break;
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
}
}
}
void gguf_add_tensor(
struct gguf_context * ctx,
const struct ggml_tensor * tensor) {
const int idx = ctx->header.n_tensors;
ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
ctx->infos[idx].name.n = strlen(tensor->name) + 1;
ctx->infos[idx].name.data = strdup(tensor->name);
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
ctx->infos[idx].ne[i] = 1;
}
ctx->infos[idx].n_dims = tensor->n_dims;
for (int i = 0; i < tensor->n_dims; i++) {
ctx->infos[idx].ne[i] = tensor->ne[i];
}
ctx->infos[idx].type = tensor->type;
ctx->infos[idx].offset = 0;
ctx->infos[idx].data = tensor->data;
ctx->infos[idx].size = ggml_nbytes(tensor);
if (ctx->header.n_tensors > 0) {
ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
}
ctx->header.n_tensors++;
}
void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
const int idx = gguf_find_tensor(ctx, name);
if (idx < 0) {
GGML_ASSERT(false && "tensor not found");
}
ctx->infos[idx].type = type;
}
void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
const int idx = gguf_find_tensor(ctx, name);
if (idx < 0) {
GGML_ASSERT(false && "tensor not found");
}
ctx->infos[idx].data = data;
ctx->infos[idx].size = size;
// update offsets
for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
}
}
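Editor's illustration (not part of the diff): a minimal end-to-end writer built from gguf_init_empty(), the setters above, gguf_add_tensor() and gguf_write_to_file(); the key names, tensor name and sizes are arbitrary:

    static void write_gguf_sketch(const char * fname) {
        struct ggml_init_params ip = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(ip);

        struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
        ggml_set_name(t, "example.weight");
        ggml_set_f32(t, 1.0f); // fill with something deterministic

        struct gguf_context * gctx = gguf_init_empty();

        gguf_set_val_str(gctx, "general.name", "example");
        gguf_set_val_u32(gctx, "example.n_layers", 1); // placeholder key

        gguf_add_tensor(gctx, t); // records name, shape, type, data pointer and size

        gguf_write_to_file(gctx, fname, false); // single pass: meta data + tensor data

        gguf_free(gctx);
        ggml_free(ctx);
    }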
//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
// fwrite(&val->n, sizeof(val->n), 1, file);
// fwrite(val->data, sizeof(char), val->n, file);
//}
//
//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
// fwrite(val, sizeof(char), size, file);
//}
struct gguf_buf {
void * data;
size_t size;
size_t offset;
};
static struct gguf_buf gguf_buf_init(size_t size) {
struct gguf_buf buf = {
/*buf.data =*/ size == 0 ? NULL : malloc(size),
/*buf.size =*/ size,
/*buf.offset =*/ 0,
};
return buf;
}
static void gguf_buf_free(struct gguf_buf buf) {
if (buf.data) {
free(buf.data);
}
}
static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
if (buf->offset + size > buf->size) {
buf->size = 1.5*(buf->offset + size);
if (buf->data) {
buf->data = realloc(buf->data, buf->size);
}
}
}
static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
gguf_buf_grow(buf, sizeof(val->n) + val->n);
if (buf->data) {
memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n));
}
buf->offset += sizeof(val->n);
if (buf->data) {
memcpy((char *) buf->data + buf->offset, val->data, val->n);
}
buf->offset += val->n;
}
static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
gguf_buf_grow(buf, el_size);
if (buf->data) {
memcpy((char *) buf->data + buf->offset, val, el_size);
}
buf->offset += el_size;
}
static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
// write header
gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
gguf_bwrite_el(buf, &ctx->header.n_kv, sizeof(ctx->header.n_kv));
// write key-value pairs
for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
struct gguf_kv * kv = &ctx->kv[i];
gguf_bwrite_str(buf, &kv->key);
gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
switch (kv->type) {
case GGUF_TYPE_UINT8: gguf_bwrite_el( buf, &kv->value.uint8, sizeof(kv->value.uint8) ); break;
case GGUF_TYPE_INT8: gguf_bwrite_el (buf, &kv->value.int8, sizeof(kv->value.int8) ); break;
case GGUF_TYPE_UINT16: gguf_bwrite_el (buf, &kv->value.uint16, sizeof(kv->value.uint16) ); break;
case GGUF_TYPE_INT16: gguf_bwrite_el (buf, &kv->value.int16, sizeof(kv->value.int16) ); break;
case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break;
case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break;
case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break;
case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break;
case GGUF_TYPE_ARRAY:
{
gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
gguf_bwrite_el(buf, &kv->value.arr.n, sizeof(kv->value.arr.n) );
switch (kv->value.arr.type) {
case GGUF_TYPE_UINT8:
case GGUF_TYPE_INT8:
case GGUF_TYPE_UINT16:
case GGUF_TYPE_INT16:
case GGUF_TYPE_UINT32:
case GGUF_TYPE_INT32:
case GGUF_TYPE_FLOAT32:
case GGUF_TYPE_BOOL:
{
gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
} break;
case GGUF_TYPE_STRING:
{
for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
}
} break;
case GGUF_TYPE_ARRAY:
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
};
} break;
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
};
}
// write tensor infos
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
struct gguf_tensor_info * info = &ctx->infos[i];
gguf_bwrite_str(buf, &info->name);
gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
for (uint32_t j = 0; j < info->n_dims; ++j) {
gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
}
gguf_bwrite_el(buf, &info->type, sizeof(info->type));
gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
}
// we require the data section to be aligned, so take into account any padding
{
const size_t offset = buf->offset;
const size_t offset_pad = GGML_PAD(offset, ctx->alignment);
if (offset_pad != offset) {
uint8_t pad = 0;
for (size_t i = 0; i < offset_pad - offset; ++i) {
gguf_bwrite_el(buf, &pad, sizeof(pad));
}
}
}
if (only_meta) {
return;
}
size_t offset = 0;
// write tensor data
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
struct gguf_tensor_info * info = &ctx->infos[i];
const size_t size = info->size;
const size_t size_pad = GGML_PAD(size, ctx->alignment);
gguf_bwrite_el(buf, info->data, size);
if (size_pad != size) {
uint8_t pad = 0;
for (size_t j = 0; j < size_pad - size; ++j) {
gguf_bwrite_el(buf, &pad, sizeof(pad));
}
}
GGML_ASSERT(offset == info->offset);
offset += size_pad;
}
}
void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
FILE * file = fopen(fname, "wb");
if (!file) {
GGML_ASSERT(false && "failed to open file for writing");
}
struct gguf_buf buf = gguf_buf_init(16*1024);
gguf_write_to_buf(ctx, &buf, only_meta);
fwrite(buf.data, 1, buf.offset, file);
gguf_buf_free(buf);
fclose(file);
}
size_t gguf_get_meta_size(struct gguf_context * ctx) {
// no allocs - only compute size
struct gguf_buf buf = gguf_buf_init(0);
gguf_write_to_buf(ctx, &buf, true);
return buf.offset;
}
void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
struct gguf_buf buf = gguf_buf_init(16*1024);
gguf_write_to_buf(ctx, &buf, true);
memcpy(data, buf.data, buf.offset);
gguf_buf_free(buf);
}
////////////////////////////////////////////////////////////////////////////////

int ggml_cpu_has_avx(void) {
ggml.h
@ -566,6 +566,7 @@ extern "C" {
    GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
    GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
    GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);

    GGML_API int     ggml_blck_size (enum ggml_type type);
@ -1498,7 +1499,6 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * tensor);

    GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);

    GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
@ -1711,7 +1711,6 @@ extern "C" {
    // gguf
    //

(the old "TODO: can be removed if the API is extended for writing" note is dropped, since a writing API is added below)

    enum gguf_type {
        GGUF_TYPE_UINT8  = 0,
        GGUF_TYPE_INT8   = 1,
@ -1735,10 +1734,14 @@ extern "C" {
        struct ggml_context ** ctx;
    };

    GGML_API struct gguf_context * gguf_init_empty(void);
    GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
    //GGML_API struct gguf_context * gguf_init_from_buffer(..);

    GGML_API void gguf_free(struct gguf_context * ctx);

    GGML_API const char * gguf_type_name(enum gguf_type type);

    GGML_API int    gguf_get_version    (struct gguf_context * ctx);
    GGML_API size_t gguf_get_alignment  (struct gguf_context * ctx);
    GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
@ -1747,13 +1750,11 @@ extern "C" {
    GGML_API int          gguf_get_n_kv(struct gguf_context * ctx);
    GGML_API int          gguf_find_key(struct gguf_context * ctx, const char * key);
    GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);

    GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
    GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);

(removed here: the old untyped gguf_get_val(ctx, i, void * val) and gguf_get_arr_f32(); the typed getters plus gguf_get_arr_data()/gguf_get_arr_str() below cover both)

    // results are undefined if the wrong type is used for the key
    GGML_API uint8_t  gguf_get_val_u8 (struct gguf_context * ctx, int i);
    GGML_API int8_t   gguf_get_val_i8 (struct gguf_context * ctx, int i);
    GGML_API uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i);
@ -1764,12 +1765,60 @@ extern "C" {
    GGML_API bool         gguf_get_val_bool(struct gguf_context * ctx, int i);
    GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
    GGML_API int          gguf_get_arr_n   (struct gguf_context * ctx, int i);
    GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
    GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);

    GGML_API int    gguf_get_n_tensors    (struct gguf_context * ctx);
    GGML_API int    gguf_find_tensor      (struct gguf_context * ctx, const char * name);
    GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
    GGML_API char * gguf_get_tensor_name  (struct gguf_context * ctx, int i);
// overrides existing values or adds a new one
GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val);
GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
// set or add KV pairs from another context
GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
// manage tensor info
GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
// writing gguf files can be done in 2 ways:
//
// - write the entire gguf_context to a binary file in a single pass:
//
// gguf_write_to_file(ctx, fname);
//
// - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
//
// FILE * f = fopen(fname, "wb");
// fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
// fwrite(f, ...);
// void * data = gguf_meta_get_meta_data(ctx);
// fseek(f, 0, SEEK_SET);
// fwrite(f, data, gguf_get_meta_size(ctx));
// free(data);
// fclose(f);
//
// write the entire context to a binary file
GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
GGML_API void gguf_get_meta_data(struct gguf_context * ctx, void * data);
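Editor's illustration (not part of the diff), spelling out the second, two-pass flow sketched in the comment above: given a populated gguf_context * ctx and a target path fname (both assumed, <stdio.h>/<stdlib.h> included, error handling omitted):

    FILE * f = fopen(fname, "wb");

    const size_t meta_size = gguf_get_meta_size(ctx); // header + kv pairs + tensor infos, padded

    // reserve room for the meta data, then stream the (alignment-padded) tensor data
    fseek(f, (long) meta_size, SEEK_SET);
    // ... fwrite() each tensor's data here, padded to gguf_get_alignment(ctx) ...

    // go back and fill in the meta data
    void * meta = malloc(meta_size);
    gguf_get_meta_data(ctx, meta);

    fseek(f, 0, SEEK_SET);
    fwrite(meta, 1, meta_size, f);

    free(meta);
    fclose(f);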
    //
    // system info
    //
(file diff suppressed because it is too large)
@ -1,505 +0,0 @@
#ifndef LLAMA_H
#define LLAMA_H
#include "ggml.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
#else
#define LLAMA_MAX_DEVICES 1
#endif // GGML_USE_CUBLAS
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#ifdef LLAMA_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef LLAMA_BUILD
# define LLAMA_API __declspec(dllexport)
# else
# define LLAMA_API __declspec(dllimport)
# endif
# else
# define LLAMA_API __attribute__ ((visibility ("default")))
# endif
#else
# define LLAMA_API
#endif
#ifdef __GNUC__
# define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
#elif defined(_MSC_VER)
# define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
#else
# define DEPRECATED(func, hint) func
#endif
#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
#define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif
#ifdef __cplusplus
extern "C" {
#endif
//
// C interface
//
// TODO: show sample usage
//
struct llama_model;
struct llama_context;
typedef int llama_token;
typedef struct llama_token_data {
llama_token id; // token id
float logit; // log-odds of the token
float p; // probability of the token
} llama_token_data;
typedef struct llama_token_data_array {
llama_token_data * data;
size_t size;
bool sorted;
} llama_token_data_array;
typedef void (*llama_progress_callback)(float progress, void *ctx);
enum llama_log_level {
LLAMA_LOG_LEVEL_ERROR = 2,
LLAMA_LOG_LEVEL_WARN = 3,
LLAMA_LOG_LEVEL_INFO = 4
};
// Signature for logging events
// Note that text includes the new line character at the end for most events.
// If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
// if it exists.
// It might not exist for progress report where '.' is output repeatedly.
typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
struct llama_context_params {
uint32_t seed; // RNG seed, -1 for random
int32_t n_ctx; // text context
int32_t n_batch; // prompt processing batch size
int32_t n_gpu_layers; // number of layers to store in VRAM
int32_t main_gpu; // the GPU that is used for scratch and small tensors
const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency
float rope_freq_scale; // RoPE frequency scaling factor
// called with a progress value between 0 and 1, pass NULL to disable
llama_progress_callback progress_callback;
// context pointer passed to the progress callback
void * progress_callback_user_data;
// Keep the booleans together to avoid misalignment during copy-by-value.
bool low_vram; // if true, reduce VRAM usage at the cost of performance
bool mul_mat_q; // if true, use experimental mul_mat_q kernels
bool f16_kv; // use fp16 for KV cache
bool logits_all; // the llama_eval() call computes all logits, not just the last one
bool vocab_only; // only load the vocabulary, no weights
bool use_mmap; // use mmap if possible
bool use_mlock; // force system to keep model in RAM
bool embedding; // embedding mode only
};
// model file types
enum llama_ftype {
LLAMA_FTYPE_ALL_F32 = 0,
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
// LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors
LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
};
// model quantization parameters
typedef struct llama_model_quantize_params {
int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
enum llama_ftype ftype; // quantize to this llama_ftype
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
} llama_model_quantize_params;
// grammar types
struct llama_grammar;
// grammar element type
enum llama_gretype {
// end of rule definition
LLAMA_GRETYPE_END = 0,
// start of alternate definition for rule
LLAMA_GRETYPE_ALT = 1,
// non-terminal element: reference to rule
LLAMA_GRETYPE_RULE_REF = 2,
// terminal element: character (code point)
LLAMA_GRETYPE_CHAR = 3,
// inverse char(s) ([^a], [^a-b] [^abc])
LLAMA_GRETYPE_CHAR_NOT = 4,
// modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
// be an inclusive range ([a-z])
LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
// modifies a preceding LLAMA_GRETYPE_CHAR or
// LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
LLAMA_GRETYPE_CHAR_ALT = 6,
};
typedef struct llama_grammar_element {
enum llama_gretype type;
uint32_t value; // Unicode code point or rule ID
} llama_grammar_element;
// performance timing information
struct llama_timings {
double t_start_ms;
double t_end_ms;
double t_load_ms;
double t_sample_ms;
double t_p_eval_ms;
double t_eval_ms;
int32_t n_sample;
int32_t n_p_eval;
int32_t n_eval;
};
// Set callback for all future logging events.
// If this is not called, or NULL is supplied, everything is output on stderr.
LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
LLAMA_API int llama_max_devices();
LLAMA_API struct llama_context_params llama_context_default_params();
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
LLAMA_API bool llama_mmap_supported();
LLAMA_API bool llama_mlock_supported();
// TODO: not great API - very likely to change
// Initialize the llama + ggml backend
// If numa is true, use NUMA optimizations
// Call once at the start of the program
LLAMA_API void llama_backend_init(bool numa);
// Call once at the end of the program - currently only used for MPI
LLAMA_API void llama_backend_free();
LLAMA_API int64_t llama_time_us();
LLAMA_API struct llama_model * llama_load_model_from_file(
const char * path_model,
struct llama_context_params params);
LLAMA_API void llama_free_model(struct llama_model * model);
LLAMA_API struct llama_context * llama_new_context_with_model(
struct llama_model * model,
struct llama_context_params params);
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);
// Returns 0 on success
LLAMA_API int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
const llama_model_quantize_params * params);
// Apply a LoRA adapter to a loaded model
// path_base_model is the path to a higher quality model to use as a base for
// the layers modified by the adapter. Can be NULL to use the current loaded model.
// The model needs to be reloaded before applying a new adapter, otherwise the adapter
// will be applied on top of the previous one
// Returns 0 on success
LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
struct llama_context * ctx,
const char * path_lora,
const char * path_base_model,
int n_threads),
"please use llama_model_apply_lora_from_file instead");
LLAMA_API int llama_model_apply_lora_from_file(
const struct llama_model * model,
const char * path_lora,
const char * path_base_model,
int n_threads);
// Returns the number of tokens in the KV cache
LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
// Sets the current rng seed.
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
// Returns the maximum size in bytes of the state (rng, logits, embedding
// and kv_cache) - will often be smaller after compacting tokens
LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
// Copies the state to the specified destination address.
// Destination needs to have allocated enough memory.
// Returns the number of bytes copied
LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
// Set the state reading from the specified address
// Returns the number of bytes read
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
// Save/load session file
LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
// Run the llama inference to obtain the logits and probabilities for the next token.
// tokens + n_tokens is the provided batch of new tokens to process
// n_past is the number of tokens to use from previous eval calls
// Returns 0 on success
LLAMA_API int llama_eval(
struct llama_context * ctx,
const llama_token * tokens,
int n_tokens,
int n_past,
int n_threads);
// Same as llama_eval, but use float matrix input directly.
LLAMA_API int llama_eval_embd(
struct llama_context * ctx,
const float * embd,
int n_tokens,
int n_past,
int n_threads);
// Export a static computation graph for context of 511 and batch size of 1
// NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
// parameters here to keep things simple
// IMPORTANT: do not use for anything else other than debugging and testing!
LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
// Convert the provided text into tokens.
// The tokens pointer must be large enough to hold the resulting tokens.
// Returns the number of tokens on success, no more than n_max_tokens
// Returns a negative number on failure - the number of tokens that would have been returned
// TODO: not sure if correct
LLAMA_API int llama_tokenize(
struct llama_context * ctx,
const char * text,
llama_token * tokens,
int n_max_tokens,
bool add_bos);
LLAMA_API int llama_tokenize_bpe(
struct llama_context * ctx,
const char * text,
llama_token * tokens,
int n_max_tokens,
bool add_bos);
LLAMA_API int llama_tokenize_with_model(
const struct llama_model * model,
const char * text,
llama_token * tokens,
int n_max_tokens,
bool add_bos);
LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
LLAMA_API int llama_n_embd (const struct llama_context * ctx);
LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
// Get the vocabulary as output parameters.
// Returns number of results.
LLAMA_API int llama_get_vocab(
const struct llama_context * ctx,
const char * * strings,
float * scores,
int capacity);
LLAMA_API int llama_get_vocab_from_model(
const struct llama_model * model,
const char * * strings,
float * scores,
int capacity);
// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row
// Can be mutated in order to change the probabilities of the next token
// Rows: n_tokens
// Cols: n_vocab
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
// Get the embeddings for the input
// shape: [n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
// Token Id -> String. Uses the vocabulary in the provided context
LLAMA_API int llama_token_to_str(
const struct llama_context * ctx,
llama_token token,
char * str,
int length);
LLAMA_API int llama_token_to_str_bpe(
const struct llama_context * ctx,
llama_token token,
char * str,
int length);
LLAMA_API int llama_token_to_str_with_model(
const struct llama_model * model,
llama_token token,
char * str,
int length);
// Special tokens
LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
LLAMA_API llama_token llama_token_eos(); // end-of-sentence
LLAMA_API llama_token llama_token_nl(); // next-line
// Grammar
//
LLAMA_API struct llama_grammar * llama_grammar_init(
const llama_grammar_element ** rules,
size_t n_rules,
size_t start_rule_index);
LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
// Sampling functions
/// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
/// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
/// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
/// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
/// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
LLAMA_API void llama_sample_classifier_free_guidance(
struct llama_context * ctx,
llama_token_data_array * candidates,
struct llama_context * guidance_ctx,
float scale);
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
/// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
/// @details Apply constraints from grammar
LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
/// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
/// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
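// Illustrative usage sketch (not part of the API): mirostat keeps `mu` as per-stream state.
// Per the notes above it starts at 2 * tau and is then updated in place by every call, e.g.
//
//     float tau = 5.0f, eta = 0.1f;
//     float mu  = 2.0f * tau; // maximum cross-entropy, updated by the sampler
//     llama_token id = llama_sample_token_mirostat_v2(ctx, &candidates, tau, eta, &mu);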
/// @details Selects the token with the highest probability.
LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
/// @details Randomly selects a token from the candidates based on their probabilities.
LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
/// @details Accepts the sampled token into the grammar
LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
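// Illustrative usage sketch (not part of the header): the calls above compose into a typical
// sampling chain. Assumes a valid `ctx`, a recent-token history and <vector>; the penalty,
// top-k, top-p and temperature values are placeholders, not recommendations.
static llama_token example_sample_next(struct llama_context * ctx, const llama_token * last_tokens, size_t n_last) {
    float * logits    = llama_get_logits(ctx);
    const int n_vocab = llama_n_vocab(ctx);

    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; id++) {
        candidates.push_back({ id, logits[id], 0.0f });
    }
    llama_token_data_array arr = { candidates.data(), candidates.size(), false };

    llama_sample_repetition_penalty(ctx, &arr, last_tokens, n_last, 1.1f);
    llama_sample_top_k      (ctx, &arr, 40,    1);
    llama_sample_top_p      (ctx, &arr, 0.95f, 1);
    llama_sample_temperature(ctx, &arr, 0.80f);

    return llama_sample_token(ctx, &arr);
}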
// Performance information
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
LLAMA_API void llama_print_timings(struct llama_context * ctx);
LLAMA_API void llama_reset_timings(struct llama_context * ctx);
// Print system information
LLAMA_API const char * llama_print_system_info(void);
#ifdef __cplusplus
}
#endif
// C++ API, will be moving to common.h soon (TM)
#ifdef LLAMA_API_CPP
#include <vector>
#include <string>
//
// Vocab utils
//
std::vector<llama_token> llama_tokenize(
struct llama_context * ctx,
const std::string & text,
bool add_bos);
std::vector<llama_token> llama_tokenize_bpe(
struct llama_context * ctx,
const std::string & text,
bool add_bos);
std::string llama_token_to_str(
const struct llama_context * ctx,
llama_token token);
std::string llama_token_to_str_bpe(
const struct llama_context * ctx,
llama_token token);
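// Usage sketch (assumes a valid context): tokenize a prompt with a leading BOS token,
// then map each id back to its text piece with the helpers declared above.
static std::string example_roundtrip(struct llama_context * ctx, const std::string & text) {
    std::string out;
    for (llama_token tok : llama_tokenize(ctx, text, /*add_bos=*/true)) {
        out += llama_token_to_str(ctx, tok); // piece for a single token
    }
    return out;
}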
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
#ifdef LLAMA_API_INTERNAL
struct ggml_tensor;
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
#endif // LLAMA_API_INTERNAL
#endif // LLAMA_API_CPP
#endif // LLAMA_H


gguf-util.h

@@ -1,470 +0,0 @@
// GGUF counterpart of llama-util.h.
// we may consider making it a part of ggml.c once GGUF work is complete.
// this will require extra work to migrate this to pure C.
// Contains wrappers around OS interfaces.
#ifndef GGUF_UTIL_H
#define GGUF_UTIL_H
#include "ggml.h"
#include <cstdio>
#include <cstdint>
#include <cerrno>
#include <cstring>
#include <cstdarg>
#include <cstdlib>
#include <climits>
#include <string>
#include <sstream>
#include <vector>
#include <stdexcept>
#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
#if defined(_POSIX_MAPPED_FILES)
#include <sys/mman.h>
#endif
#if defined(_POSIX_MEMLOCK_RANGE)
#include <sys/resource.h>
#endif
#endif
#endif
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <io.h>
#include <stdio.h> // for _fseeki64
#endif
#ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((format(gnu_printf, 1, 2)))
#else
__attribute__((format(printf, 1, 2)))
#endif
#endif
static std::string format(const char * fmt, ...) {
va_list ap, ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
GGML_ASSERT(size >= 0 && size < INT_MAX);
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
GGML_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), size);
}
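// Usage sketch: format() is a printf-style std::string builder, used below mostly for
// exception messages, e.g.
//
//     throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));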
// TODO: can we merge this one and gguf_context?
struct gguf_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
size_t size;
gguf_file(const char * fname, const char * mode) {
fp = std::fopen(fname, mode);
if (fp == NULL) {
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
}
seek(0, SEEK_END);
size = tell();
seek(0, SEEK_SET);
}
size_t tell() const {
#ifdef _WIN32
__int64 ret = _ftelli64(fp);
#else
long ret = std::ftell(fp);
#endif
GGML_ASSERT(ret != -1); // this really shouldn't fail
return (size_t) ret;
}
void seek(size_t offset, int whence) {
#ifdef _WIN32
int ret = _fseeki64(fp, (__int64) offset, whence);
#else
int ret = std::fseek(fp, (long) offset, whence);
#endif
GGML_ASSERT(ret == 0); // same
}
size_t write_str(const std::string & val) {
size_t total_written = 0;
const int32_t n = val.size();
fwrite((const char *) &n, sizeof(n), 1, fp);
total_written += sizeof(n);
fwrite(val.c_str(), n, 1, fp);
total_written += n;
return total_written;
}
size_t write_i32(int32_t val) {
fwrite((const char *) &val, sizeof(val), 1, fp);
return sizeof(val);
}
size_t write_u64(size_t val) {
fwrite((const char *) &val, sizeof(val), 1, fp);
return sizeof(val);
}
template<typename T>
void write_val(const std::string & key, enum gguf_type type, const T & val) {
write_str(key);
fwrite((const char *) &type, sizeof(type), 1, fp);
fwrite((const char *) &val, sizeof(val), 1, fp);
}
template<typename T>
void write_arr(const std::string & key, enum gguf_type type, const std::vector<T> & val) {
write_str(key);
{
const enum gguf_type tarr = GGUF_TYPE_ARRAY;
fwrite((const char *) &tarr, sizeof(tarr), 1, fp);
}
const int32_t n = val.size();
fwrite((const char *) &type, sizeof(type), 1, fp);
fwrite((const char *) &n, sizeof(n), 1, fp);
fwrite(val.data(), sizeof(T), n, fp);
}
void write_str(const std::string & key, enum gguf_type type, const std::string & val) {
write_str(key);
fwrite((const char *) &type, sizeof(type), 1, fp);
const int32_t n = val.size();
fwrite((const char *) &n, sizeof(n), 1, fp);
fwrite(val.c_str(), n, 1, fp);
}
void write_str(const std::string & key, enum gguf_type type, const std::vector<std::string> & val) {
write_str(key);
{
const enum gguf_type tarr = GGUF_TYPE_ARRAY;
fwrite((const char *) &tarr, sizeof(tarr), 1, fp);
}
const int32_t n = val.size();
fwrite((const char *) &type, sizeof(type), 1, fp);
fwrite((const char *) &n, sizeof(n), 1, fp);
for (int i = 0; i < n; ++i) {
const int32_t nstr = val[i].size();
fwrite((const char *) &nstr, sizeof(nstr), 1, fp);
fwrite(val[i].c_str(), nstr, 1, fp);
}
}
void write_zeros(size_t count) {
for (size_t i = 0; i < count; ++i) {
fputc(0, fp);
}
}
void read_raw(void * ptr, size_t len) const {
if (len == 0) {
return;
}
errno = 0;
std::size_t ret = std::fread(ptr, len, 1, fp);
if (ferror(fp)) {
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (ret != 1) {
throw std::runtime_error(std::string("unexpectedly reached end of file"));
}
}
void write_raw(const void * ptr, size_t len) const {
if (len == 0) {
return;
}
errno = 0;
size_t ret = std::fwrite(ptr, len, 1, fp);
if (ret != 1) {
throw std::runtime_error(format("write error: %s", strerror(errno)));
}
}
~gguf_file() {
if (fp) {
std::fclose(fp);
}
}
};
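// Usage sketch (hypothetical file name): writing typed key/value pairs with the helpers
// above; the GGUF_TYPE_* constants come from the gguf_type enum in ggml.h.
static void example_write_kv(const char * fname) {
    gguf_file file(fname, "wb");
    file.write_str("general.name", GGUF_TYPE_STRING, std::string("example"));
    file.write_val<uint32_t>("general.alignment", GGUF_TYPE_UINT32, 32u);
    file.write_arr<float>("tokenizer.ggml.scores", GGUF_TYPE_FLOAT32, { 0.0f, -1.0f });
}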
#if defined(_WIN32)
static std::string gguf_format_win_err(DWORD err) {
LPSTR buf;
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
if (!size) {
return "FormatMessageA failed";
}
std::string ret(buf, size);
LocalFree(buf);
return ret;
}
#endif
struct gguf_mmap {
void * addr;
size_t size;
gguf_mmap(const gguf_mmap &) = delete;
#ifdef _POSIX_MAPPED_FILES
static constexpr bool SUPPORTED = true;
gguf_mmap(struct gguf_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
size = file->size;
int fd = fileno(file->fp);
int flags = MAP_SHARED;
// prefetch/readahead impairs performance on NUMA systems
if (numa) { prefetch = 0; }
#ifdef __linux__
if (prefetch) { flags |= MAP_POPULATE; }
#endif
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
if (addr == MAP_FAILED) {
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
}
if (prefetch > 0) {
// Advise the kernel to preload the mapped memory
if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
strerror(errno));
}
}
if (numa) {
// advise the kernel not to use readahead
// (because the next page might not belong on the same node)
if (madvise(addr, file->size, MADV_RANDOM)) {
fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
strerror(errno));
}
}
}
~gguf_mmap() {
munmap(addr, size);
}
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
gguf_mmap(struct gguf_file * file, bool prefetch = true, bool numa = false) {
(void) numa;
size = file->size;
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
DWORD error = GetLastError();
if (hMapping == NULL) {
throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
}
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
error = GetLastError();
CloseHandle(hMapping);
if (addr == NULL) {
throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
}
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
if (prefetch) {
// Advise the kernel to preload the mapped memory
WIN32_MEMORY_RANGE_ENTRY range;
range.VirtualAddress = addr;
range.NumberOfBytes = (SIZE_T)size;
if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
gguf_format_win_err(GetLastError()).c_str());
}
}
#else
#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
}
~gguf_mmap() {
if (!UnmapViewOfFile(addr)) {
fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
gguf_format_win_err(GetLastError()).c_str());
}
}
#else
static constexpr bool SUPPORTED = false;
gguf_mmap(struct gguf_file *, bool prefetch = true, bool numa = false) {
(void) prefetch;
(void) numa;
throw std::runtime_error(std::string("mmap not supported"));
}
#endif
};
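// Usage sketch (hypothetical path): map a model file read-only; the mapping is released
// automatically when the object goes out of scope.
static void example_mmap(const char * fname) {
    gguf_file file(fname, "rb");
    gguf_mmap  mapping(&file); // prefetches the whole file by default
    const uint8_t * data = (const uint8_t *) mapping.addr;
    fprintf(stdout, "mapped %zu bytes, first byte: 0x%02x\n", mapping.size, data[0]);
}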
// Represents some region of memory being locked using mlock or VirtualLock;
// will automatically unlock on destruction.
struct gguf_mlock {
void * addr = NULL;
size_t size = 0;
bool failed_already = false;
gguf_mlock() {}
gguf_mlock(const gguf_mlock &) = delete;
~gguf_mlock() {
if (size) {
raw_unlock(addr, size);
}
}
void init(void * ptr) {
GGML_ASSERT(addr == NULL && size == 0);
addr = ptr;
}
void grow_to(size_t target_size) {
GGML_ASSERT(addr);
if (failed_already) {
return;
}
size_t granularity = lock_granularity();
target_size = (target_size + granularity - 1) & ~(granularity - 1);
if (target_size > size) {
if (raw_lock((uint8_t *) addr + size, target_size - size)) {
size = target_size;
} else {
failed_already = true;
}
}
}
#ifdef _POSIX_MEMLOCK_RANGE
static constexpr bool SUPPORTED = true;
size_t lock_granularity() {
return (size_t) sysconf(_SC_PAGESIZE);
}
#ifdef __APPLE__
#define MLOCK_SUGGESTION \
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
#else
#define MLOCK_SUGGESTION \
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
#endif
bool raw_lock(const void * addr, size_t size) {
if (!mlock(addr, size)) {
return true;
} else {
char* errmsg = std::strerror(errno);
bool suggest = (errno == ENOMEM);
// Check if the resource limit is fine after all
struct rlimit lock_limit;
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
suggest = false;
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
suggest = false;
fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
return false;
}
}
#undef MLOCK_SUGGESTION
void raw_unlock(void * addr, size_t size) {
if (munlock(addr, size)) {
fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
}
}
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
size_t lock_granularity() {
SYSTEM_INFO si;
GetSystemInfo(&si);
return (size_t) si.dwPageSize;
}
bool raw_lock(void * ptr, size_t len) {
for (int tries = 1; ; tries++) {
if (VirtualLock(ptr, len)) {
return true;
}
if (tries == 2) {
fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
len, size, gguf_format_win_err(GetLastError()).c_str());
return false;
}
// It failed but this was only the first try; increase the working
// set size and try again.
SIZE_T min_ws_size, max_ws_size;
if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
gguf_format_win_err(GetLastError()).c_str());
return false;
}
// Per MSDN: "The maximum number of pages that a process can lock
// is equal to the number of pages in its minimum working set minus
// a small overhead."
// Hopefully a megabyte is enough overhead:
size_t increment = len + 1048576;
// The minimum must be <= the maximum, so we need to increase both:
min_ws_size += increment;
max_ws_size += increment;
if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
gguf_format_win_err(GetLastError()).c_str());
return false;
}
}
}
void raw_unlock(void * ptr, size_t len) {
if (!VirtualUnlock(ptr, len)) {
fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
gguf_format_win_err(GetLastError()).c_str());
}
}
#else
static constexpr bool SUPPORTED = false;
size_t lock_granularity() {
return (size_t) 65536;
}
bool raw_lock(const void * addr, size_t len) {
fprintf(stderr, "warning: mlock not supported on this system\n");
return false;
}
void raw_unlock(const void * addr, size_t len) {}
#endif
};
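// Usage sketch: pin an already-mapped region in RAM. grow_to() rounds the requested size
// up to the lock granularity (the page size on POSIX) and warns on failure instead of throwing.
static void example_mlock(gguf_mmap & mapping) {
    gguf_mlock lock;
    lock.init(mapping.addr);
    lock.grow_to(mapping.size); // best effort; sets failed_already on error
}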
#endif

gguf.py

@@ -4,14 +4,169 @@
3. After development is done, Convert it to a proper pip-installable Python package, and possibly move it to its own repo under ggml-org.
"""
-import sys
import struct
-import constants
+import numpy as np
from enum import IntEnum
from typing import Any, IO, List
-import numpy as np
-import sys
+#
+# constants
+#
GGUF_MAGIC = 0x47475546
GGUF_VERSION = 1
GGUF_DEFAULT_ALIGNMENT = 32
# general
KEY_GENERAL_ARCHITECTURE = "general.architecture"
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
KEY_GENERAL_ALIGNMENT = "general.alignment"
KEY_GENERAL_NAME = "general.name"
KEY_GENERAL_AUTHOR = "general.author"
KEY_GENERAL_URL = "general.url"
KEY_GENERAL_DESCRIPTION = "general.description"
KEY_GENERAL_FILE_TYPE = "general.file_type"
KEY_GENERAL_LICENSE = "general.license"
KEY_GENERAL_SOURCE_URL = "general.source.url"
KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"
# LLM
KEY_LLM_CONTEXT_LENGTH = "{llm}.context_length"
KEY_LLM_EMBEDDING_LENGTH = "{llm}.embedding_length"
KEY_LLM_BLOCK_COUNT = "{llm}.block_count"
KEY_LLM_FEED_FORWARD_LENGTH = "{llm}.feed_forward_length"
KEY_LLM_USE_PARALLEL_RESIDUAL = "{llm}.use_parallel_residual"
KEY_LLM_TENSOR_DATA_LAYOUT = "{llm}.tensor_data_layout"
# attention
KEY_ATTENTION_HEAD_COUNT = "{llm}.attention.head_count"
KEY_ATTENTION_HEAD_COUNT_KV = "{llm}.attention.head_count_kv"
KEY_ATTENTION_MAX_ALIBI_BIAS = "{llm}.attention.max_alibi_bias"
KEY_ATTENTION_CLAMP_KQV = "{llm}.attention.clamp_kqv"
KEY_ATTENTION_LAYERNORM_EPS = "{llm}.attention.layer_norm_epsilon"
KEY_ATTENTION_LAYERNORM_RMS_EPS = "{llm}.attention.layer_norm_rms_epsilon"
# RoPE
KEY_ROPE_DIMENSION_COUNT = "{llm}.rope.dimension_count"
KEY_ROPE_SCALE = "{llm}.rope.scale"
# tokenization
KEY_TOKENIZER_MODEL = "tokenizer.ggml.model"
KEY_TOKENIZER_LIST = "tokenizer.ggml.tokens"
KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"
KEY_TOKENIZER_SCORES = "tokenizer.ggml.scores"
KEY_TOKENIZER_MERGES = "tokenizer.ggml.merges"
KEY_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"
KEY_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"
KEY_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"
KEY_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"
KEY_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"
KEY_TOKENIZER_HF_JSON = "tokenizer.huggingface.json"
KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
#
# recommended mapping of model tensor names for storage in gguf
#
def get_tensor_name_map(n_blocks : int):
tensor_map = {}
# Token embeddings
mapped_to = "token_embd"
tensor_map["gpt_neox.embed_in"] = mapped_to # gptneox
tensor_map["transformer.wte"] = mapped_to # gpt2 mpt
tensor_map["transformer.word_embeddings"] = mapped_to # falcon
tensor_map["model.embed_tokens"] = mapped_to # llama-hf
tensor_map["tok_embeddings"] = mapped_to # llama-pth
# Position embeddings
mapped_to = "pos_embd"
tensor_map["transformer.wpe"] = mapped_to # gpt2
# Output norm
mapped_to = "output_norm"
tensor_map["gpt_neox.final_layer_norm"] = mapped_to # gptneox
tensor_map["transformer.ln_f"] = mapped_to # gpt2 falcon
tensor_map["transformer.norm_f"] = mapped_to # mpt
tensor_map["model.norm"] = mapped_to # llama-hf
tensor_map["norm"] = mapped_to # llama-pth
# Output
mapped_to = "output"
tensor_map["embed_out"] = mapped_to # gptneox
tensor_map["lm_head"] = mapped_to # gpt2 mpt falcon llama-hf
tensor_map["output"] = mapped_to # llama-pth
# Attention and feed-forward layer blocks
for i in range(0,n_blocks):
# Attention norm
mapped_to = "blk."+str(i)+".attn_norm"
tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".ln_1"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".norm_1"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".input_layernorm"] = mapped_to # falcon7b
tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to # falcon40b
tensor_map["model.layers."+str(i)+".input_layernorm"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention_norm"] = mapped_to # llama-pth
# Attention norm 2
mapped_to = "blk."+str(i)+".attn_norm_2"
tensor_map["transformer.h."+str(i)+".ln_mlp"] = mapped_to # falcon40b
# Attention query-key-value
mapped_to = "blk."+str(i)+".attn_qkv"
tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".attn.c_attn"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to # falcon
# Attention query
mapped_to = "blk."+str(i)+".attn_q"
tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wq"] = mapped_to # llama-pth
# Attention key
mapped_to = "blk."+str(i)+".attn_k"
tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wk"] = mapped_to # llama-pth
# Attention value
mapped_to = "blk."+str(i)+".attn_v"
tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wv"] = mapped_to # llama-pth
# Attention output
mapped_to = "blk."+str(i)+".attn_output"
tensor_map["gpt_neox.layers."+str(i)+".attention.dense"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".attn.c_proj"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".attn.out_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".self_attn.o_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wo"] = mapped_to # llama-pth
# Feed-forward norm
mapped_to = "blk."+str(i)+".ffn_norm"
tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".ln_2"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".norm_2"] = mapped_to # mpt
tensor_map["model.layers."+str(i)+".post_attention_layernorm"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".ffn_norm"] = mapped_to # llama-pth
# Feed-forward up
mapped_to = "blk."+str(i)+".ffn_up"
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".mlp.c_fc"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".mlp.up_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w3"] = mapped_to # llama-pth
# Feed-forward gate
mapped_to = "blk."+str(i)+".ffn_gate"
tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w1"] = mapped_to # llama-pth
# Feed-forward down
mapped_to = "blk."+str(i)+".ffn_down"
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".mlp.c_proj"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".mlp.down_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w2"] = mapped_to # llama-pth
return tensor_map
#
# implementation
#
class GGMLQuantizationType(IntEnum):
    F32 = 0
@@ -51,15 +206,15 @@ class GGUFWriter:
    def __init__(self, fout: IO):
        self.fout = fout
        self.offset_tensor = 0
-       self.data_alignment = constants.GGUF_DEFAULT_ALIGNMENT
+       self.data_alignment = GGUF_DEFAULT_ALIGNMENT
        self.kv_data = b""
        self.kv_data_count = 0
        self.ti_data = b""
        self.ti_data_count = 0
    def write_header_to_file(self):
-       self.fout.write(struct.pack("<I", constants.GGUF_MAGIC))
-       self.fout.write(struct.pack("<I", constants.GGUF_VERSION))
+       self.fout.write(struct.pack("<I", GGUF_MAGIC))
+       self.fout.write(struct.pack("<I", GGUF_VERSION))
        self.fout.write(struct.pack("<I", self.ti_data_count))
        self.fout.write(struct.pack("<I", self.kv_data_count))
        self.flush()
@@ -201,123 +356,125 @@ class GGUFWriter:
        self.fout.close()
    def add_architecture(self, architecture: str):
-       self.add_string(constants.KEY_GENERAL_ARCHITECTURE,
+       self.add_string(KEY_GENERAL_ARCHITECTURE,
                        architecture)
    def add_author(self, author: str):
-       self.add_string(constants.KEY_GENERAL_AUTHOR, author)
+       self.add_string(KEY_GENERAL_AUTHOR, author)
+   def add_tensor_data_layout(self, layout: str):
+       self.add_string(KEY_LLM_TENSOR_DATA_LAYOUT , layout)
    def add_url(self, url: str):
-       self.add_string(constants.KEY_GENERAL_URL, url)
+       self.add_string(KEY_GENERAL_URL, url)
    def add_description(self, description: str):
-       self.add_string(constants.KEY_GENERAL_DESCRIPTION, description)
+       self.add_string(KEY_GENERAL_DESCRIPTION, description)
    def add_file_type(self, file_type: str):
-       self.add_string(constants.KEY_GENERAL_FILE_TYPE, file_type)
+       self.add_string(KEY_GENERAL_FILE_TYPE, file_type)
    def add_source_url(self, url: str):
-       self.add_string(constants.KEY_GENERAL_SOURCE_URL, url)
+       self.add_string(KEY_GENERAL_SOURCE_URL, url)
    def add_source_hf_repo(self, repo: str):
-       self.add_string(constants.KEY_GENERAL_SOURCE_HF_REPO, repo)
+       self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)
    def add_name(self, name: str):
-       self.add_string(constants.KEY_GENERAL_NAME, name)
+       self.add_string(KEY_GENERAL_NAME, name)
    def add_quantization_version(self, quantization_version: GGMLQuantizationType):
        self.add_uint32(
-           constants.KEY_GENERAL_QUANTIZATION_VERSION, quantization_version)
+           KEY_GENERAL_QUANTIZATION_VERSION, quantization_version)
    def add_custom_alignment(self, alignment: int):
        self.data_alignment = alignment
-       self.add_uint32(constants.KEY_GENERAL_ALIGNMENT, alignment)
+       self.add_uint32(KEY_GENERAL_ALIGNMENT, alignment)
    def add_context_length(self, llm: str, length: int):
        self.add_uint32(
-           constants.KEY_LLM_CONTEXT_LENGTH.format(llm=llm), length)
+           KEY_LLM_CONTEXT_LENGTH.format(llm=llm), length)
    def add_embedding_length(self, llm: str, length: int):
        self.add_uint32(
-           constants.KEY_LLM_EMBEDDING_LENGTH.format(llm=llm), length)
+           KEY_LLM_EMBEDDING_LENGTH.format(llm=llm), length)
    def add_block_count(self, llm: str, length: int):
        self.add_uint32(
-           constants.KEY_LLM_BLOCK_COUNT.format(llm=llm), length)
+           KEY_LLM_BLOCK_COUNT.format(llm=llm), length)
    def add_feed_forward_length(self, llm: str, length: int):
        self.add_uint32(
-           constants.KEY_LLM_FEED_FORWARD_LENGTH.format(llm=llm), length)
+           KEY_LLM_FEED_FORWARD_LENGTH.format(llm=llm), length)
    def add_parallel_residual(self, llm: str, use: bool):
        self.add_bool(
-           constants.KEY_LLM_USE_PARALLEL_RESIDUAL.format(llm=llm), use)
+           KEY_LLM_USE_PARALLEL_RESIDUAL.format(llm=llm), use)
    def add_tensor_data_layout(self, llm: str, layout: str):
        self.add_string(
-           constants.KEY_LLM_TENSOR_DATA_LAYOUT.format(llm=llm), layout)
+           KEY_LLM_TENSOR_DATA_LAYOUT.format(llm=llm), layout)
    def add_head_count(self, llm: str, count: int):
        self.add_uint32(
-           constants.KEY_ATTENTION_HEAD_COUNT.format(llm=llm), count)
+           KEY_ATTENTION_HEAD_COUNT.format(llm=llm), count)
    def add_head_count_kv(self, llm: str, count: int):
        self.add_uint32(
-           constants.KEY_ATTENTION_HEAD_COUNT_KV.format(llm=llm), count)
+           KEY_ATTENTION_HEAD_COUNT_KV.format(llm=llm), count)
    def add_max_alibi_bias(self, llm: str, bias: float):
        self.add_float32(
-           constants.KEY_ATTENTION_MAX_ALIBI_BIAS.format(llm=llm), bias)
+           KEY_ATTENTION_MAX_ALIBI_BIAS.format(llm=llm), bias)
    def add_clamp_kqv(self, llm: str, value: float):
        self.add_float32(
-           constants.KEY_ATTENTION_CLAMP_KQV.format(llm=llm), value)
+           KEY_ATTENTION_CLAMP_KQV.format(llm=llm), value)
    def add_layer_norm_eps(self, llm: str, value: float):
        self.add_float32(
-           constants.KEY_ATTENTION_LAYERNORM_EPS.format(llm=llm), value)
+           KEY_ATTENTION_LAYERNORM_EPS.format(llm=llm), value)
    def add_layer_norm_rms_eps(self, llm: str, value: float):
        self.add_float32(
-           constants.KEY_ATTENTION_LAYERNORM_RMS_EPS.format(llm=llm), value)
+           KEY_ATTENTION_LAYERNORM_RMS_EPS.format(llm=llm), value)
    def add_rope_dimension_count(self, llm: str, count: int):
        self.add_uint32(
-           constants.KEY_ROPE_DIMENSION_COUNT.format(llm=llm), count)
+           KEY_ROPE_DIMENSION_COUNT.format(llm=llm), count)
    def add_rope_scale(self, llm: str, value: float):
-       self.add_float32(constants.KEY_ROPE_SCALE.format(llm=llm), value)
+       self.add_float32(KEY_ROPE_SCALE.format(llm=llm), value)
    def add_tokenizer_model(self, model: str):
-       self.add_string(constants.KEY_TOKENIZER_MODEL, model)
+       self.add_string(KEY_TOKENIZER_MODEL, model)
    def add_token_list(self, tokens: List):
-       self.add_array(constants.KEY_TOKENIZER_LIST, tokens)
+       self.add_array(KEY_TOKENIZER_LIST, tokens)
    def add_token_merges(self, merges: List):
-       self.add_array(constants.KEY_TOKENIZER_MERGES, merges)
+       self.add_array(KEY_TOKENIZER_MERGES, merges)
    def add_token_types(self, types: List[int]):
-       self.add_array(constants.KEY_TOKENIZER_TOKEN_TYPE, types)
+       self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types)
    def add_token_scores(self, scores: List[float]):
-       self.add_array(constants.KEY_TOKENIZER_SCORES, scores)
+       self.add_array(KEY_TOKENIZER_SCORES, scores)
    def add_bos_token_id(self, id: int):
-       self.add_uint32(constants.KEY_TOKENIZER_BOS_ID, id)
+       self.add_uint32(KEY_TOKENIZER_BOS_ID, id)
    def add_eos_token_id(self, id: int):
-       self.add_uint32(constants.KEY_TOKENIZER_EOS_ID, id)
+       self.add_uint32(KEY_TOKENIZER_EOS_ID, id)
    def add_unk_token_id(self, id: int):
-       self.add_uint32(constants.KEY_TOKENIZER_UNK_ID, id)
+       self.add_uint32(KEY_TOKENIZER_UNK_ID, id)
    def add_sep_token_id(self, id: int):
-       self.add_uint32(constants.KEY_TOKENIZER_SEP_ID, id)
+       self.add_uint32(KEY_TOKENIZER_SEP_ID, id)
    def add_pad_token_id(self, id: int):
-       self.add_uint32(constants.KEY_TOKENIZER_PAD_ID, id)
+       self.add_uint32(KEY_TOKENIZER_PAD_ID, id)
# Example usage:
if __name__ == "__main__":


@@ -1,95 +0,0 @@
# Recommended mapping of model tensor names for storage in gguf
def get_tensor_namemap( n_blocks : int):
tensor_map = {}
# Token embeddings
mapped_to = "token_embd"
tensor_map["gpt_neox.embed_in"] = mapped_to # gptneox
tensor_map["transformer.wte"] = mapped_to # gpt2 mpt
tensor_map["transformer.word_embeddings"] = mapped_to # falcon
tensor_map["model.embed_tokens"] = mapped_to # llama-hf
tensor_map["tok_embeddings"] = mapped_to # llama-pth
# Position embeddings
mapped_to = "pos_embd"
tensor_map["transformer.wpe"] = mapped_to # gpt2
# Output norm
mapped_to = "output_norm"
tensor_map["gpt_neox.final_layer_norm"] = mapped_to # gptneox
tensor_map["transformer.ln_f"] = mapped_to # gpt2 falcon
tensor_map["transformer.norm_f"] = mapped_to # mpt
tensor_map["model.norm"] = mapped_to # llama-hf
tensor_map["norm"] = mapped_to # llama-pth
# Output
mapped_to = "output"
tensor_map["embed_out"] = mapped_to # gptneox
tensor_map["lm_head"] = mapped_to # gpt2 mpt falcon llama-hf
tensor_map["output"] = mapped_to # llama-pth
# Attention and feed-forward layer blocks
for i in range(0,n_blocks):
# Attention norm
mapped_to = "blk."+str(i)+".attn_norm"
tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".ln_1"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".norm_1"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".input_layernorm"] = mapped_to # falcon7b
tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to # falcon40b
tensor_map["model.layers."+str(i)+".input_layernorm"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention_norm"] = mapped_to # llama-pth
# Attention norm 2
mapped_to = "blk."+str(i)+".attn_norm_2"
tensor_map["transformer.h."+str(i)+".ln_mlp"] = mapped_to # falcon40b
# Attention query-key-value
mapped_to = "blk."+str(i)+".attn_qkv"
tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".attn.c_attn"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to # falcon
# Attention query
mapped_to = "blk."+str(i)+".attn_q"
tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wq"] = mapped_to # llama-pth
# Attention key
mapped_to = "blk."+str(i)+".attn_k"
tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wk"] = mapped_to # llama-pth
# Attention value
mapped_to = "blk."+str(i)+".attn_v"
tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wv"] = mapped_to # llama-pth
# Attention output
mapped_to = "blk."+str(i)+".attn_output"
tensor_map["gpt_neox.layers."+str(i)+".attention.dense"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".attn.c_proj"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".attn.out_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".self_attn.o_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wo"] = mapped_to # llama-pth
# Feed-forward norm
mapped_to = "blk."+str(i)+".ffn_norm"
tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".ln_2"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".norm_2"] = mapped_to # mpt
tensor_map["model.layers."+str(i)+".post_attention_layernorm"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".ffn_norm"] = mapped_to # llama-pth
# Feed-forward up
mapped_to = "blk."+str(i)+".ffn_up"
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".mlp.c_fc"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".mlp.up_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w3"] = mapped_to # llama-pth
# Feed-forward gate
mapped_to = "blk."+str(i)+".ffn_gate"
tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w1"] = mapped_to # llama-pth
# Feed-forward down
mapped_to = "blk."+str(i)+".ffn_down"
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".mlp.c_proj"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".mlp.down_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w2"] = mapped_to # llama-pth
return tensor_map


@@ -381,6 +381,8 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
    if (keyidx != -1) { fprintf(stdout, "%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
    keyidx = gguf_find_key(ggufctx, "general.file_type");
    if (keyidx != -1) { fprintf(stdout, "%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+   keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
+   if (keyidx != -1) { fprintf(stdout, "%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
    keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
    if (keyidx != -1) { fprintf(stdout, "%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
    }


llama-util.h

@@ -1,553 +0,0 @@
// Internal header to be included only by llama.cpp.
// Contains wrappers around OS interfaces.
#ifndef LLAMA_UTIL_H
#define LLAMA_UTIL_H
#include <cstdio>
#include <cstdint>
#include <cerrno>
#include <cstring>
#include <cstdarg>
#include <cstdlib>
#include <climits>
#include <string>
#include <vector>
#include <stdexcept>
#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
#if defined(_POSIX_MAPPED_FILES)
#include <sys/mman.h>
#endif
#if defined(_POSIX_MEMLOCK_RANGE)
#include <sys/resource.h>
#endif
#endif
#endif
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <io.h>
#include <stdio.h> // for _fseeki64
#endif
#define LLAMA_ASSERT(x) \
do { \
if (!(x)) { \
fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
abort(); \
} \
} while (0)
#ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((format(gnu_printf, 1, 2)))
#else
__attribute__((format(printf, 1, 2)))
#endif
#endif
static std::string format(const char * fmt, ...) {
va_list ap, ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
LLAMA_ASSERT(size >= 0 && size < INT_MAX);
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
LLAMA_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), size);
}
struct llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
size_t size;
llama_file(const char * fname, const char * mode) {
fp = std::fopen(fname, mode);
if (fp == NULL) {
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
}
seek(0, SEEK_END);
size = tell();
seek(0, SEEK_SET);
}
size_t tell() const {
#ifdef _WIN32
__int64 ret = _ftelli64(fp);
#else
long ret = std::ftell(fp);
#endif
LLAMA_ASSERT(ret != -1); // this really shouldn't fail
return (size_t) ret;
}
void seek(size_t offset, int whence) {
#ifdef _WIN32
int ret = _fseeki64(fp, (__int64) offset, whence);
#else
int ret = std::fseek(fp, (long) offset, whence);
#endif
LLAMA_ASSERT(ret == 0); // same
}
void read_raw(void * ptr, size_t len) const {
if (len == 0) {
return;
}
errno = 0;
std::size_t ret = std::fread(ptr, len, 1, fp);
if (ferror(fp)) {
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (ret != 1) {
throw std::runtime_error(std::string("unexpectedly reached end of file"));
}
}
std::uint32_t read_u32() {
std::uint32_t ret;
read_raw(&ret, sizeof(ret));
return ret;
}
std::string read_string(std::uint32_t len) {
std::vector<char> chars(len);
read_raw(chars.data(), len);
return std::string(chars.data(), len);
}
void write_raw(const void * ptr, size_t len) const {
if (len == 0) {
return;
}
errno = 0;
size_t ret = std::fwrite(ptr, len, 1, fp);
if (ret != 1) {
throw std::runtime_error(format("write error: %s", strerror(errno)));
}
}
void write_u32(std::uint32_t val) {
write_raw(&val, sizeof(val));
}
~llama_file() {
if (fp) {
std::fclose(fp);
}
}
};
// llama_context_data
struct llama_data_context {
virtual void write(const void * src, size_t size) = 0;
virtual size_t get_size_written() = 0;
virtual ~llama_data_context() = default;
};
struct llama_data_buffer_context : llama_data_context {
uint8_t* ptr;
size_t size_written = 0;
llama_data_buffer_context(uint8_t * p) : ptr(p) {}
void write(const void * src, size_t size) override {
memcpy(ptr, src, size);
ptr += size;
size_written += size;
}
size_t get_size_written() override {
return size_written;
}
};
struct llama_data_file_context : llama_data_context {
llama_file* file;
size_t size_written = 0;
llama_data_file_context(llama_file * f) : file(f) {}
void write(const void * src, size_t size) override {
file->write_raw(src, size);
size_written += size;
}
size_t get_size_written() override {
return size_written;
}
};
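// Usage sketch: both context types expose the same write() interface, so serialization
// code can target a caller-provided buffer or a llama_file without changes.
static size_t example_serialize(const void * src, size_t n, uint8_t * dst) {
    llama_data_buffer_context data_ctx(dst);
    data_ctx.write(src, n);
    return data_ctx.get_size_written(); // == n
}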
#if defined(_WIN32)
static std::string llama_format_win_err(DWORD err) {
LPSTR buf;
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
if (!size) {
return "FormatMessageA failed";
}
std::string ret(buf, size);
LocalFree(buf);
return ret;
}
#endif
struct llama_mmap {
void * addr;
size_t size;
llama_mmap(const llama_mmap &) = delete;
#ifdef _POSIX_MAPPED_FILES
static constexpr bool SUPPORTED = true;
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
size = file->size;
int fd = fileno(file->fp);
int flags = MAP_SHARED;
// prefetch/readahead impairs performance on NUMA systems
if (numa) { prefetch = 0; }
#ifdef __linux__
if (prefetch >= file->size) { flags |= MAP_POPULATE; }
#endif
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
if (addr == MAP_FAILED) {
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
}
if (prefetch > 0) {
// Advise the kernel to preload the mapped memory
if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
strerror(errno));
}
}
if (numa) {
// advise the kernel not to use readahead
// (because the next page might not belong on the same node)
if (madvise(addr, file->size, MADV_RANDOM)) {
fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
strerror(errno));
}
}
}
~llama_mmap() {
munmap(addr, size);
}
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
(void) numa;
size = file->size;
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
DWORD error = GetLastError();
if (hMapping == NULL) {
throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
}
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
error = GetLastError();
CloseHandle(hMapping);
if (addr == NULL) {
throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
}
if (prefetch) {
// The PrefetchVirtualMemory API is only present on Windows 8 and above, so we
// will dynamically load it using GetProcAddress.
BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
HMODULE hKernel32;
// This call is guaranteed to succeed.
hKernel32 = GetModuleHandleW(L"kernel32.dll");
// This call may fail if on a pre-Win8 system.
pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
if (pPrefetchVirtualMemory) {
// Advise the kernel to preload the mapped memory.
WIN32_MEMORY_RANGE_ENTRY range;
range.VirtualAddress = addr;
range.NumberOfBytes = (SIZE_T)size;
if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
}
}
}
}
~llama_mmap() {
if (!UnmapViewOfFile(addr)) {
fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
}
}
#else
static constexpr bool SUPPORTED = false;
llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
(void) prefetch;
(void) numa;
throw std::runtime_error(std::string("mmap not supported"));
}
#endif
};
// Represents some region of memory being locked using mlock or VirtualLock;
// will automatically unlock on destruction.
struct llama_mlock {
void * addr = NULL;
size_t size = 0;
bool failed_already = false;
llama_mlock() {}
llama_mlock(const llama_mlock &) = delete;
~llama_mlock() {
if (size) {
raw_unlock(addr, size);
}
}
void init(void * ptr) {
LLAMA_ASSERT(addr == NULL && size == 0);
addr = ptr;
}
void grow_to(size_t target_size) {
LLAMA_ASSERT(addr);
if (failed_already) {
return;
}
size_t granularity = lock_granularity();
target_size = (target_size + granularity - 1) & ~(granularity - 1);
if (target_size > size) {
if (raw_lock((uint8_t *) addr + size, target_size - size)) {
size = target_size;
} else {
failed_already = true;
}
}
}
#ifdef _POSIX_MEMLOCK_RANGE
static constexpr bool SUPPORTED = true;
size_t lock_granularity() {
return (size_t) sysconf(_SC_PAGESIZE);
}
#ifdef __APPLE__
#define MLOCK_SUGGESTION \
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
#else
#define MLOCK_SUGGESTION \
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
#endif
bool raw_lock(const void * addr, size_t size) {
if (!mlock(addr, size)) {
return true;
} else {
char* errmsg = std::strerror(errno);
bool suggest = (errno == ENOMEM);
// Check if the resource limit is fine after all
struct rlimit lock_limit;
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
suggest = false;
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
suggest = false;
fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
return false;
}
}
#undef MLOCK_SUGGESTION
void raw_unlock(void * addr, size_t size) {
if (munlock(addr, size)) {
fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
}
}
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
size_t lock_granularity() {
SYSTEM_INFO si;
GetSystemInfo(&si);
return (size_t) si.dwPageSize;
}
bool raw_lock(void * ptr, size_t len) {
for (int tries = 1; ; tries++) {
if (VirtualLock(ptr, len)) {
return true;
}
if (tries == 2) {
fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
len, size, llama_format_win_err(GetLastError()).c_str());
return false;
}
// It failed but this was only the first try; increase the working
// set size and try again.
SIZE_T min_ws_size, max_ws_size;
if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
return false;
}
// Per MSDN: "The maximum number of pages that a process can lock
// is equal to the number of pages in its minimum working set minus
// a small overhead."
// Hopefully a megabyte is enough overhead:
size_t increment = len + 1048576;
// The minimum must be <= the maximum, so we need to increase both:
min_ws_size += increment;
max_ws_size += increment;
if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
return false;
}
}
}
void raw_unlock(void * ptr, size_t len) {
if (!VirtualUnlock(ptr, len)) {
fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
llama_format_win_err(GetLastError()).c_str());
}
}
#else
static constexpr bool SUPPORTED = false;
size_t lock_granularity() {
return (size_t) 65536;
}
bool raw_lock(const void * addr, size_t len) {
fprintf(stderr, "warning: mlock not supported on this system\n");
return false;
}
void raw_unlock(const void * addr, size_t len) {}
#endif
};
// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
struct llama_buffer {
uint8_t * addr = NULL;
size_t size = 0;
llama_buffer() = default;
void resize(size_t len) {
#ifdef GGML_USE_METAL
free(addr);
int result = posix_memalign((void **) &addr, getpagesize(), len);
if (result == 0) {
memset(addr, 0, len);
}
else {
addr = NULL;
}
#else
delete[] addr;
addr = new uint8_t[len];
#endif
size = len;
}
~llama_buffer() {
#ifdef GGML_USE_METAL
free(addr);
#else
delete[] addr;
#endif
addr = NULL;
}
// disable copy and move
llama_buffer(const llama_buffer&) = delete;
llama_buffer(llama_buffer&&) = delete;
llama_buffer& operator=(const llama_buffer&) = delete;
llama_buffer& operator=(llama_buffer&&) = delete;
};
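// Usage sketch: llama_buffer is an owning byte buffer that, unlike std::vector, leaves
// its contents uninitialized on resize (outside the Metal path above).
static void example_buffer() {
    llama_buffer buf;
    buf.resize(1024);              // allocate 1 KiB, contents indeterminate
    memset(buf.addr, 0, buf.size); // caller initializes as needed
}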
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
struct llama_ctx_buffer {
uint8_t * addr = NULL;
bool is_cuda;
size_t size = 0;
llama_ctx_buffer() = default;
void resize(size_t size) {
free();
addr = (uint8_t *) ggml_cuda_host_malloc(size);
if (addr) {
is_cuda = true;
}
else {
// fall back to pageable memory
addr = new uint8_t[size];
is_cuda = false;
}
this->size = size;
}
void free() {
if (addr) {
if (is_cuda) {
ggml_cuda_host_free(addr);
}
else {
delete[] addr;
}
}
addr = NULL;
}
~llama_ctx_buffer() {
free();
}
// disable copy and move
llama_ctx_buffer(const llama_ctx_buffer&) = delete;
llama_ctx_buffer(llama_ctx_buffer&&) = delete;
llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
};
#else
typedef llama_buffer llama_ctx_buffer;
#endif
#endif

llama.cpp

File diff suppressed because it is too large

llama.h

@@ -34,29 +34,18 @@
# define DEPRECATED(func, hint) func
#endif
-#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
-#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
-#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
-#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
+#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
-#define LLAMA_FILE_VERSION 3
-#define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
-#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 1
-#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
#define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif
-#ifndef LLAMA_DEFAULT_RMS_EPS
-#define LLAMA_DEFAULT_RMS_EPS 5e-6f
-#endif
#ifdef __cplusplus
extern "C" {
#endif
@@ -103,8 +92,6 @@ extern "C" {
        uint32_t seed;         // RNG seed, -1 for random
        int32_t  n_ctx;        // text context
        int32_t  n_batch;      // prompt processing batch size
-       int32_t  n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
-       float    rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
        int32_t  n_gpu_layers; // number of layers to store in VRAM
        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
@@ -129,6 +116,7 @@ extern "C" {
        bool use_mlock; // force system to keep model in RAM
        bool embedding; // embedding mode only
    };
    // model file types
    enum llama_ftype {
        LLAMA_FTYPE_ALL_F32 = 0,
@@ -208,17 +196,12 @@ extern "C" {
        int32_t n_eval;
    };
-   // Set callback for all future logging events.
-   // If this is not called, or NULL is supplied, everything is output on stderr.
-   LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
-   LLAMA_API int llama_max_devices();
-   LLAMA_API struct llama_context_params llama_context_default_params();
-   LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
-   LLAMA_API bool llama_mmap_supported();
-   LLAMA_API bool llama_mlock_supported();
+   LLAMA_API struct llama_context_params llama_context_default_params(void);
+   LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
+   LLAMA_API int llama_max_devices(void);
+   LLAMA_API bool llama_mmap_supported(void);
+   LLAMA_API bool llama_mlock_supported(void);
    // TODO: not great API - very likely to change
    // Initialize the llama + ggml backend
@@ -226,9 +209,9 @@ extern "C" {
    // Call once at the start of the program
    LLAMA_API void llama_backend_init(bool numa);
    // Call once at the end of the program - currently only used for MPI
-   LLAMA_API void llama_backend_free();
-   LLAMA_API int64_t llama_time_us();
+   LLAMA_API void llama_backend_free(void);
+   LLAMA_API int64_t llama_time_us(void);
    LLAMA_API struct llama_model * llama_load_model_from_file(
        const char * path_model,
@@ -240,13 +223,6 @@
        struct llama_model * model,
        struct llama_context_params params);
-   // Various functions for loading a ggml llama model.
-   // Allocate (almost) all memory needed for the model.
-   // Return NULL on failure
-   LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
-       const char * path_model,
-       struct llama_context_params params),
-       "please use llama_load_model_from_file combined with llama_new_context_with_model instead");
    // Frees all allocated memory
    LLAMA_API void llama_free(struct llama_context * ctx);
@@ -384,27 +360,28 @@
    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
    // Token Id -> String. Uses the vocabulary in the provided context
+   // Does not write null terminator to the buffer
    LLAMA_API int llama_token_to_str(
        const struct llama_context * ctx,
        llama_token token,
-       char * str,
+       char * buf,
        int length);
    LLAMA_API int llama_token_to_str_bpe(
        const struct llama_context * ctx,
        llama_token token,
-       char * str,
+       char * buf,
        int length);
    LLAMA_API int llama_token_to_str_with_model(
        const struct llama_model * model,
        llama_token token,
-       char * str,
+       char * buf,
        int length);
    // Special tokens
-   LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
-   LLAMA_API llama_token llama_token_eos(); // end-of-sentence
-   LLAMA_API llama_token llama_token_nl();  // next-line
+   LLAMA_API llama_token llama_token_bos(void); // beginning-of-sentence
+   LLAMA_API llama_token llama_token_eos(void); // end-of-sentence
+   LLAMA_API llama_token llama_token_nl(void);  // next-line
    // Grammar
    //
@@ -484,6 +461,10 @@
    // Print system information
    LLAMA_API const char * llama_print_system_info(void);
+   // Set callback for all future logging events.
+   // If this is not called, or NULL is supplied, everything is output on stderr.
+   LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
#ifdef __cplusplus
}
#endif